OSDN Git Service

Merge remote-tracking branch 'jekstrand/wip/i965-uniforms' into vulkan
authorKristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
Thu, 21 Jan 2016 19:07:09 +0000 (11:07 -0800)
committerKristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
Thu, 21 Jan 2016 19:09:58 +0000 (11:09 -0800)
663 files changed:
README.intel-vulkan.txt [new file with mode: 0644]
configure.ac
docs/GL3.txt
docs/contents.html
docs/index.html
docs/relnotes.html
docs/relnotes/11.0.8.html [new file with mode: 0644]
docs/relnotes/11.1.0.html
docs/relnotes/11.1.1.html [new file with mode: 0644]
docs/relnotes/11.2.0.html
docs/thanks.html
include/GL/osmesa.h
include/pci_ids/i965_pci_ids.h
include/vulkan/vk_platform.h [new file with mode: 0644]
include/vulkan/vulkan.h [new file with mode: 0644]
include/vulkan/vulkan_intel.h [new file with mode: 0644]
scons/gallium.py
src/Makefile.am
src/gallium/auxiliary/Makefile.am
src/gallium/auxiliary/Makefile.sources
src/gallium/auxiliary/draw/draw_cliptest_tmp.h
src/gallium/auxiliary/draw/draw_llvm.c
src/gallium/auxiliary/draw/draw_llvm.h
src/gallium/auxiliary/draw/draw_pipe_aaline.c
src/gallium/auxiliary/draw/draw_pipe_clip.c
src/gallium/auxiliary/draw/draw_pipe_pstipple.c
src/gallium/auxiliary/draw/draw_pipe_unfilled.c
src/gallium/auxiliary/draw/draw_pipe_vbuf.c
src/gallium/auxiliary/draw/draw_private.h
src/gallium/auxiliary/draw/draw_pt.c
src/gallium/auxiliary/draw/draw_pt_emit.c
src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
src/gallium/auxiliary/draw/draw_pt_post_vs.c
src/gallium/auxiliary/draw/draw_pt_so_emit.c
src/gallium/auxiliary/draw/draw_vertex.h
src/gallium/auxiliary/gallivm/lp_bld_conv.c
src/gallium/auxiliary/gallivm/lp_bld_limits.h
src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
src/gallium/auxiliary/hud/hud_context.c
src/gallium/auxiliary/indices/u_primconvert.c
src/gallium/auxiliary/nir/tgsi_to_nir.c
src/gallium/auxiliary/tgsi/tgsi_build.c
src/gallium/auxiliary/tgsi/tgsi_dump.c
src/gallium/auxiliary/tgsi/tgsi_exec.h
src/gallium/auxiliary/tgsi/tgsi_info.c
src/gallium/auxiliary/tgsi/tgsi_info.h
src/gallium/auxiliary/tgsi/tgsi_parse.c
src/gallium/auxiliary/tgsi/tgsi_parse.h
src/gallium/auxiliary/tgsi/tgsi_scan.c
src/gallium/auxiliary/tgsi/tgsi_scan.h
src/gallium/auxiliary/tgsi/tgsi_strings.c
src/gallium/auxiliary/tgsi/tgsi_strings.h
src/gallium/auxiliary/tgsi/tgsi_text.c
src/gallium/auxiliary/tgsi/tgsi_ureg.c
src/gallium/auxiliary/tgsi/tgsi_ureg.h
src/gallium/auxiliary/tgsi/tgsi_util.c
src/gallium/auxiliary/util/u_blitter.c
src/gallium/auxiliary/util/u_debug.c
src/gallium/auxiliary/util/u_debug.h
src/gallium/auxiliary/util/u_init.h [deleted file]
src/gallium/auxiliary/util/u_pstipple.c
src/gallium/auxiliary/util/u_pstipple.h
src/gallium/auxiliary/util/u_pwr8.h [new file with mode: 0644]
src/gallium/auxiliary/util/u_simple_shaders.h
src/gallium/auxiliary/util/u_sse.h
src/gallium/auxiliary/util/u_surface.c
src/gallium/auxiliary/util/u_upload_mgr.c
src/gallium/auxiliary/util/u_upload_mgr.h
src/gallium/auxiliary/util/u_vbuf.c
src/gallium/auxiliary/vl/vl_compositor.c
src/gallium/auxiliary/vl/vl_deint_filter.c
src/gallium/auxiliary/vl/vl_deint_filter.h
src/gallium/auxiliary/vl/vl_mc.c
src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
src/gallium/auxiliary/vl/vl_video_buffer.c
src/gallium/auxiliary/vl/vl_video_buffer.h
src/gallium/docs/source/context.rst
src/gallium/docs/source/screen.rst
src/gallium/docs/source/tgsi.rst
src/gallium/drivers/freedreno/.gitignore [new file with mode: 0644]
src/gallium/drivers/freedreno/Makefile.sources
src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
src/gallium/drivers/freedreno/a3xx/fd3_context.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
src/gallium/drivers/freedreno/a4xx/fd4_context.c
src/gallium/drivers/freedreno/a4xx/fd4_emit.c
src/gallium/drivers/freedreno/a4xx/fd4_program.c
src/gallium/drivers/freedreno/adreno_common.xml.h
src/gallium/drivers/freedreno/adreno_pm4.xml.h
src/gallium/drivers/freedreno/freedreno_context.h
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
src/gallium/drivers/freedreno/ir3/ir3_nir.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_nir.h
src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
src/gallium/drivers/freedreno/ir3/ir3_print.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c
src/gallium/drivers/freedreno/ir3/ir3_shader.h
src/gallium/drivers/i915/i915_context.h
src/gallium/drivers/i915/i915_screen.c
src/gallium/drivers/i915/i915_state.c
src/gallium/drivers/i915/i915_state_derived.c
src/gallium/drivers/ilo/core/ilo_builder.c
src/gallium/drivers/ilo/ilo_context.c
src/gallium/drivers/ilo/ilo_gpgpu.c
src/gallium/drivers/ilo/ilo_screen.c
src/gallium/drivers/ilo/ilo_state.c
src/gallium/drivers/ilo/shader/ilo_shader_fs.c
src/gallium/drivers/ilo/shader/ilo_shader_vs.c
src/gallium/drivers/ilo/shader/toy_legalize_ra.c
src/gallium/drivers/ilo/shader/toy_tgsi.c
src/gallium/drivers/llvmpipe/lp_bld_interp.h
src/gallium/drivers/llvmpipe/lp_context.h
src/gallium/drivers/llvmpipe/lp_rast.h
src/gallium/drivers/llvmpipe/lp_rast_tri.c
src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
src/gallium/drivers/llvmpipe/lp_screen.c
src/gallium/drivers/llvmpipe/lp_setup.c
src/gallium/drivers/llvmpipe/lp_setup_context.h
src/gallium/drivers/llvmpipe/lp_setup_line.c
src/gallium/drivers/llvmpipe/lp_setup_point.c
src/gallium/drivers/llvmpipe/lp_setup_tri.c
src/gallium/drivers/llvmpipe/lp_state_derived.c
src/gallium/drivers/llvmpipe/lp_state_fs.c
src/gallium/drivers/llvmpipe/lp_state_setup.c
src/gallium/drivers/llvmpipe/lp_state_setup.h
src/gallium/drivers/llvmpipe/lp_test_blend.c
src/gallium/drivers/llvmpipe/lp_test_conv.c
src/gallium/drivers/nouveau/codegen/nv50_ir.h
src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
src/gallium/drivers/nouveau/nouveau_compiler.c
src/gallium/drivers/nouveau/nouveau_screen.c
src/gallium/drivers/nouveau/nouveau_screen.h
src/gallium/drivers/nouveau/nouveau_vp3_video.h
src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
src/gallium/drivers/nouveau/nouveau_winsys.h
src/gallium/drivers/nouveau/nv30/nv30_draw.c
src/gallium/drivers/nouveau/nv30/nv30_screen.c
src/gallium/drivers/nouveau/nv50/nv50_context.h
src/gallium/drivers/nouveau/nv50/nv50_miptree.c
src/gallium/drivers/nouveau/nv50/nv50_program.c
src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
src/gallium/drivers/nouveau/nv50/nv50_screen.c
src/gallium/drivers/nouveau/nv50/nv50_state.c
src/gallium/drivers/nouveau/nv50/nv50_vbo.c
src/gallium/drivers/nouveau/nv50/nv98_video.c
src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
src/gallium/drivers/nouveau/nvc0/nvc0_context.h
src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
src/gallium/drivers/nouveau/nvc0/nvc0_program.c
src/gallium/drivers/nouveau/nvc0/nvc0_program.h
src/gallium/drivers/nouveau/nvc0/nvc0_query.c
src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
src/gallium/drivers/nouveau/nvc0/nvc0_state.c
src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
src/gallium/drivers/nouveau/nvc0/nvc0_video.c
src/gallium/drivers/nouveau/nvc0/nvc0_video.h
src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
src/gallium/drivers/r300/compiler/r500_fragprog.c
src/gallium/drivers/r300/r300_context.c
src/gallium/drivers/r300/r300_render.c
src/gallium/drivers/r300/r300_render_translate.c
src/gallium/drivers/r300/r300_screen.c
src/gallium/drivers/r300/r300_screen_buffer.c
src/gallium/drivers/r300/r300_state_derived.c
src/gallium/drivers/r600/evergreen_compute.c
src/gallium/drivers/r600/evergreen_state.c
src/gallium/drivers/r600/r600_llvm.c
src/gallium/drivers/r600/r600_llvm.h
src/gallium/drivers/r600/r600_pipe.c
src/gallium/drivers/r600/r600_pipe.h
src/gallium/drivers/r600/r600_shader.c
src/gallium/drivers/r600/r600_state.c
src/gallium/drivers/r600/r600_state_common.c
src/gallium/drivers/radeon/Makefile.am
src/gallium/drivers/radeon/r600_buffer_common.c
src/gallium/drivers/radeon/r600_perfcounter.c
src/gallium/drivers/radeon/r600_pipe_common.c
src/gallium/drivers/radeon/r600_pipe_common.h
src/gallium/drivers/radeon/r600_query.c
src/gallium/drivers/radeon/radeon_llvm_emit.c
src/gallium/drivers/radeon/radeon_llvm_emit.h
src/gallium/drivers/radeon/radeon_winsys.h
src/gallium/drivers/radeonsi/cik_sdma.c
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_debug.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_perfcounter.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c
src/gallium/drivers/softpipe/sp_context.h
src/gallium/drivers/softpipe/sp_prim_vbuf.c
src/gallium/drivers/softpipe/sp_query.c
src/gallium/drivers/softpipe/sp_screen.c
src/gallium/drivers/softpipe/sp_setup.c
src/gallium/drivers/softpipe/sp_setup.h
src/gallium/drivers/softpipe/sp_state.h
src/gallium/drivers/softpipe/sp_state_derived.c
src/gallium/drivers/softpipe/sp_state_shader.c
src/gallium/drivers/svga/svga_cmd.c
src/gallium/drivers/svga/svga_cmd.h
src/gallium/drivers/svga/svga_cmd_vgpu10.c
src/gallium/drivers/svga/svga_context.c
src/gallium/drivers/svga/svga_context.h
src/gallium/drivers/svga/svga_draw.c
src/gallium/drivers/svga/svga_format.c
src/gallium/drivers/svga/svga_format.h
src/gallium/drivers/svga/svga_pipe_query.c
src/gallium/drivers/svga/svga_pipe_rasterizer.c
src/gallium/drivers/svga/svga_pipe_sampler.c
src/gallium/drivers/svga/svga_resource.c
src/gallium/drivers/svga/svga_resource_buffer_upload.c
src/gallium/drivers/svga/svga_resource_texture.c
src/gallium/drivers/svga/svga_resource_texture.h
src/gallium/drivers/svga/svga_sampler_view.h
src/gallium/drivers/svga/svga_screen.c
src/gallium/drivers/svga/svga_state.c
src/gallium/drivers/svga/svga_state_constants.c
src/gallium/drivers/svga/svga_state_fs.c
src/gallium/drivers/svga/svga_state_sampler.c
src/gallium/drivers/svga/svga_state_vs.c
src/gallium/drivers/svga/svga_swtnl_state.c
src/gallium/drivers/svga/svga_tgsi_insn.c
src/gallium/drivers/svga/svga_tgsi_vgpu10.c
src/gallium/drivers/svga/svga_winsys.h
src/gallium/drivers/trace/tr_context.c
src/gallium/drivers/vc4/Makefile.sources
src/gallium/drivers/vc4/vc4_blit.c
src/gallium/drivers/vc4/vc4_context.c
src/gallium/drivers/vc4/vc4_draw.c
src/gallium/drivers/vc4/vc4_drm.h
src/gallium/drivers/vc4/vc4_job.c
src/gallium/drivers/vc4/vc4_nir_lower_blend.c
src/gallium/drivers/vc4/vc4_nir_lower_io.c
src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
src/gallium/drivers/vc4/vc4_opt_algebraic.c
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qir_schedule.c [new file with mode: 0644]
src/gallium/drivers/vc4/vc4_qpu_emit.c
src/gallium/drivers/vc4/vc4_qpu_schedule.c
src/gallium/drivers/vc4/vc4_resource.c
src/gallium/drivers/vc4/vc4_screen.c
src/gallium/drivers/vc4/vc4_screen.h
src/gallium/drivers/vc4/vc4_simulator.c
src/gallium/drivers/vc4/vc4_simulator_validate.h
src/gallium/drivers/vc4/vc4_state.c
src/gallium/drivers/virgl/virgl_context.c
src/gallium/drivers/virgl/virgl_screen.c
src/gallium/include/pipe/p_context.h
src/gallium/include/pipe/p_defines.h
src/gallium/include/pipe/p_shader_tokens.h
src/gallium/include/pipe/p_state.h
src/gallium/state_trackers/nine/device9.c
src/gallium/state_trackers/nine/device9.h
src/gallium/state_trackers/nine/nine_ff.c
src/gallium/state_trackers/nine/nine_shader.c
src/gallium/state_trackers/nine/nine_state.c
src/gallium/state_trackers/nine/swapchain9.c
src/gallium/state_trackers/omx/entrypoint.c
src/gallium/state_trackers/omx/vid_enc.c
src/gallium/state_trackers/osmesa/osmesa.c
src/gallium/state_trackers/va/buffer.c
src/gallium/state_trackers/va/context.c
src/gallium/state_trackers/va/image.c
src/gallium/state_trackers/va/picture.c
src/gallium/state_trackers/va/picture_h264.c
src/gallium/state_trackers/va/picture_hevc.c
src/gallium/state_trackers/va/picture_mpeg12.c
src/gallium/state_trackers/va/picture_vc1.c
src/gallium/state_trackers/va/postproc.c
src/gallium/state_trackers/va/subpicture.c
src/gallium/state_trackers/va/surface.c
src/gallium/state_trackers/va/va_private.h
src/gallium/state_trackers/vdpau/mixer.c
src/gallium/state_trackers/vdpau/surface.c
src/gallium/targets/opencl/Makefile.am
src/gallium/targets/va/Makefile.am
src/gallium/tests/trivial/compute.c
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
src/gallium/winsys/radeon/drm/radeon_drm_bo.c
src/gallium/winsys/svga/drm/vmw_context.c
src/glsl/.gitignore
src/glsl/Makefile.am
src/glsl/Makefile.sources
src/glsl/SConscript
src/glsl/ast.h
src/glsl/ast_to_hir.cpp
src/glsl/builtin_functions.cpp
src/glsl/builtin_variables.cpp
src/glsl/glcpp/glcpp-parse.y
src/glsl/glsl_parser.yy
src/glsl/glsl_parser_extras.cpp
src/glsl/glsl_parser_extras.h
src/glsl/ir.cpp
src/glsl/ir.h
src/glsl/ir_clone.cpp
src/glsl/ir_constant_expression.cpp
src/glsl/ir_optimization.h
src/glsl/ir_set_program_inouts.cpp
src/glsl/ir_validate.cpp
src/glsl/link_atomics.cpp
src/glsl/link_interface_blocks.cpp
src/glsl/link_uniform_block_active_visitor.cpp
src/glsl/link_uniform_blocks.cpp
src/glsl/link_uniform_initializers.cpp
src/glsl/link_uniforms.cpp
src/glsl/link_varyings.cpp
src/glsl/link_varyings.h
src/glsl/linker.cpp
src/glsl/lower_instructions.cpp
src/glsl/lower_mat_op_to_vec.cpp
src/glsl/lower_packing_builtins.cpp
src/glsl/nir/glsl_to_nir.cpp
src/glsl/nir/glsl_types.cpp
src/glsl/nir/glsl_types.h
src/glsl/nir/nir.c
src/glsl/nir/nir.h
src/glsl/nir/nir_algebraic.py
src/glsl/nir/nir_builder.h
src/glsl/nir/nir_clone.c
src/glsl/nir/nir_control_flow.c
src/glsl/nir/nir_dominance.c
src/glsl/nir/nir_from_ssa.c
src/glsl/nir/nir_gather_info.c [new file with mode: 0644]
src/glsl/nir/nir_gs_count_vertices.c
src/glsl/nir/nir_inline_functions.c [new file with mode: 0644]
src/glsl/nir/nir_instr_set.c
src/glsl/nir/nir_intrinsics.h
src/glsl/nir/nir_lower_alu_to_scalar.c
src/glsl/nir/nir_lower_atomics.c
src/glsl/nir/nir_lower_clip.c
src/glsl/nir/nir_lower_global_vars_to_local.c
src/glsl/nir/nir_lower_gs_intrinsics.c
src/glsl/nir/nir_lower_idiv.c
src/glsl/nir/nir_lower_indirect_derefs.c [new file with mode: 0644]
src/glsl/nir/nir_lower_io.c
src/glsl/nir/nir_lower_load_const_to_scalar.c
src/glsl/nir/nir_lower_locals_to_regs.c
src/glsl/nir/nir_lower_outputs_to_temporaries.c
src/glsl/nir/nir_lower_phis_to_scalar.c
src/glsl/nir/nir_lower_returns.c [new file with mode: 0644]
src/glsl/nir/nir_lower_samplers.c
src/glsl/nir/nir_lower_samplers.cpp [new file with mode: 0644]
src/glsl/nir/nir_lower_system_values.c
src/glsl/nir/nir_lower_tex.c
src/glsl/nir/nir_lower_to_source_mods.c
src/glsl/nir/nir_lower_two_sided_color.c
src/glsl/nir/nir_lower_var_copies.c
src/glsl/nir/nir_lower_vars_to_ssa.c
src/glsl/nir/nir_lower_vec_to_movs.c
src/glsl/nir/nir_metadata.c
src/glsl/nir/nir_move_vec_src_uses_to_dest.c
src/glsl/nir/nir_normalize_cubemap_coords.c
src/glsl/nir/nir_opcodes.py
src/glsl/nir/nir_opt_algebraic.py
src/glsl/nir/nir_opt_constant_folding.c
src/glsl/nir/nir_opt_copy_propagate.c
src/glsl/nir/nir_opt_cse.c
src/glsl/nir/nir_opt_dce.c
src/glsl/nir/nir_opt_dead_cf.c
src/glsl/nir/nir_opt_gcm.c
src/glsl/nir/nir_opt_peephole_select.c
src/glsl/nir/nir_opt_remove_phis.c
src/glsl/nir/nir_opt_undef.c
src/glsl/nir/nir_phi_builder.c [new file with mode: 0644]
src/glsl/nir/nir_phi_builder.h [new file with mode: 0644]
src/glsl/nir/nir_print.c
src/glsl/nir/nir_remove_dead_variables.c
src/glsl/nir/nir_repair_ssa.c [new file with mode: 0644]
src/glsl/nir/nir_split_var_copies.c
src/glsl/nir/nir_sweep.c
src/glsl/nir/nir_to_ssa.c
src/glsl/nir/nir_types.cpp
src/glsl/nir/nir_types.h
src/glsl/nir/nir_validate.c
src/glsl/nir/shader_enums.c
src/glsl/nir/shader_enums.h
src/glsl/nir/spirv/GLSL.std.450.h [new file with mode: 0644]
src/glsl/nir/spirv/nir_spirv.h [new file with mode: 0644]
src/glsl/nir/spirv/spirv.h [new file with mode: 0644]
src/glsl/nir/spirv/spirv_to_nir.c [new file with mode: 0644]
src/glsl/nir/spirv/vtn_alu.c [new file with mode: 0644]
src/glsl/nir/spirv/vtn_cfg.c [new file with mode: 0644]
src/glsl/nir/spirv/vtn_glsl450.c [new file with mode: 0644]
src/glsl/nir/spirv/vtn_private.h [new file with mode: 0644]
src/glsl/nir/spirv2nir.c [new file with mode: 0644]
src/glsl/nir/tests/control_flow_tests.cpp
src/glsl/standalone_scaffolding.cpp
src/glx/dri3_glx.c
src/isl/.gitignore [new file with mode: 0644]
src/isl/Makefile.am [new file with mode: 0644]
src/isl/README [new file with mode: 0644]
src/isl/isl.c [new file with mode: 0644]
src/isl/isl.h [new file with mode: 0644]
src/isl/isl_format.c [new file with mode: 0644]
src/isl/isl_format_layout.csv [new file with mode: 0644]
src/isl/isl_format_layout_gen.bash [new file with mode: 0755]
src/isl/isl_gen4.c [new file with mode: 0644]
src/isl/isl_gen4.h [new file with mode: 0644]
src/isl/isl_gen6.c [new file with mode: 0644]
src/isl/isl_gen6.h [new file with mode: 0644]
src/isl/isl_gen7.c [new file with mode: 0644]
src/isl/isl_gen7.h [new file with mode: 0644]
src/isl/isl_gen8.c [new file with mode: 0644]
src/isl/isl_gen8.h [new file with mode: 0644]
src/isl/isl_gen9.c [new file with mode: 0644]
src/isl/isl_gen9.h [new file with mode: 0644]
src/isl/isl_image.c [new file with mode: 0644]
src/isl/isl_priv.h [new file with mode: 0644]
src/isl/tests/.gitignore [new file with mode: 0644]
src/isl/tests/isl_surf_get_image_offset_test.c [new file with mode: 0644]
src/mapi/glapi/gen/ARB_indirect_parameters.xml [new file with mode: 0644]
src/mapi/glapi/gen/Makefile.am
src/mapi/glapi/gen/gl_API.xml
src/mesa/drivers/common/meta.c
src/mesa/drivers/common/meta.h
src/mesa/drivers/common/meta_blit.c
src/mesa/drivers/common/meta_generate_mipmap.c
src/mesa/drivers/dri/common/drirc
src/mesa/drivers/dri/i915/intel_buffer_objects.c
src/mesa/drivers/dri/i915/intel_pixel_copy.c
src/mesa/drivers/dri/i965/Makefile.sources
src/mesa/drivers/dri/i965/brw_binding_tables.c
src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
src/mesa/drivers/dri/i965/brw_compiler.h
src/mesa/drivers/dri/i965/brw_context.c
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_device_info.c
src/mesa/drivers/dri/i965/brw_device_info.h
src/mesa/drivers/dri/i965/brw_draw.c
src/mesa/drivers/dri/i965/brw_draw_upload.c
src/mesa/drivers/dri/i965/brw_eu_emit.c
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
src/mesa/drivers/dri/i965/brw_fs_generator.cpp
src/mesa/drivers/dri/i965/brw_fs_nir.cpp
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
src/mesa/drivers/dri/i965/brw_link.cpp
src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
src/mesa/drivers/dri/i965/brw_nir.c
src/mesa/drivers/dri/i965/brw_nir.h
src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
src/mesa/drivers/dri/i965/brw_pipe_control.c
src/mesa/drivers/dri/i965/brw_program.c
src/mesa/drivers/dri/i965/brw_program.h
src/mesa/drivers/dri/i965/brw_reg.h
src/mesa/drivers/dri/i965/brw_sampler_state.c
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_shader.h
src/mesa/drivers/dri/i965/brw_state_dump.c
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/brw_surface_formats.c
src/mesa/drivers/dri/i965/brw_surface_formats.h [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_tcs.c [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
src/mesa/drivers/dri/i965/brw_tes.c [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_tes_surface_state.c
src/mesa/drivers/dri/i965/brw_util.c
src/mesa/drivers/dri/i965/brw_vec4.cpp
src/mesa/drivers/dri/i965/brw_vec4.h
src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4_tcs.h [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4_tes.cpp [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4_tes.h [new file with mode: 0644]
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
src/mesa/drivers/dri/i965/brw_vs.c
src/mesa/drivers/dri/i965/brw_vue_map.c
src/mesa/drivers/dri/i965/brw_wm_surface_state.c
src/mesa/drivers/dri/i965/gen6_vs_state.c
src/mesa/drivers/dri/i965/gen7_blorp.cpp
src/mesa/drivers/dri/i965/gen7_cs_state.c
src/mesa/drivers/dri/i965/gen7_ds_state.c
src/mesa/drivers/dri/i965/gen7_hs_state.c
src/mesa/drivers/dri/i965/gen7_te_state.c
src/mesa/drivers/dri/i965/gen7_urb.c
src/mesa/drivers/dri/i965/gen7_wm_state.c
src/mesa/drivers/dri/i965/gen8_draw_upload.c
src/mesa/drivers/dri/i965/gen8_ds_state.c
src/mesa/drivers/dri/i965/gen8_hs_state.c
src/mesa/drivers/dri/i965/gen8_ps_state.c
src/mesa/drivers/dri/i965/intel_blit.c
src/mesa/drivers/dri/i965/intel_buffer_objects.c
src/mesa/drivers/dri/i965/intel_extensions.c
src/mesa/drivers/dri/i965/intel_mipmap_tree.c
src/mesa/drivers/dri/i965/intel_pixel_copy.c
src/mesa/drivers/dri/i965/intel_screen.c
src/mesa/drivers/dri/i965/intel_screen.h
src/mesa/drivers/dri/nouveau/nouveau_driver.c
src/mesa/drivers/dri/nouveau/nouveau_driver.h
src/mesa/drivers/dri/nouveau/nouveau_screen.c
src/mesa/drivers/dri/nouveau/nouveau_screen.h
src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
src/mesa/drivers/osmesa/osmesa.c
src/mesa/main/api_arrayelt.c
src/mesa/main/api_arrayelt.h
src/mesa/main/api_loopback.c
src/mesa/main/api_validate.c
src/mesa/main/api_validate.h
src/mesa/main/atifragshader.c
src/mesa/main/blit.c
src/mesa/main/bufferobj.c
src/mesa/main/bufferobj.h
src/mesa/main/dd.h
src/mesa/main/extensions_table.h
src/mesa/main/fbobject.c
src/mesa/main/get.c
src/mesa/main/get_hash_params.py
src/mesa/main/imports.h
src/mesa/main/mtypes.h
src/mesa/main/performance_monitor.c
src/mesa/main/program_resource.c
src/mesa/main/samplerobj.c
src/mesa/main/samplerobj.h
src/mesa/main/shader_query.cpp
src/mesa/main/shaderapi.c
src/mesa/main/shaderimage.c
src/mesa/main/stencil.c
src/mesa/main/tests/dispatch_sanity.cpp
src/mesa/main/teximage.c
src/mesa/main/texobj.c
src/mesa/main/uniform_query.cpp
src/mesa/main/uniforms.c
src/mesa/main/version.c
src/mesa/math/m_matrix.c
src/mesa/program/ir_to_mesa.cpp
src/mesa/program/prog_parameter.c
src/mesa/program/prog_parameter.h
src/mesa/program/prog_statevars.c
src/mesa/program/prog_to_nir.c
src/mesa/program/programopt.c
src/mesa/program/programopt.h
src/mesa/state_tracker/st_atom.c
src/mesa/state_tracker/st_atom_constbuf.c
src/mesa/state_tracker/st_atom_rasterizer.c
src/mesa/state_tracker/st_cb_bitmap.c
src/mesa/state_tracker/st_cb_bufferobjects.c
src/mesa/state_tracker/st_cb_bufferobjects.h
src/mesa/state_tracker/st_cb_clear.c
src/mesa/state_tracker/st_cb_drawpixels.c
src/mesa/state_tracker/st_cb_drawtex.c
src/mesa/state_tracker/st_cb_texture.c
src/mesa/state_tracker/st_context.c
src/mesa/state_tracker/st_context.h
src/mesa/state_tracker/st_copytex.c
src/mesa/state_tracker/st_debug.c
src/mesa/state_tracker/st_debug.h
src/mesa/state_tracker/st_draw.c
src/mesa/state_tracker/st_draw_feedback.c
src/mesa/state_tracker/st_extensions.c
src/mesa/state_tracker/st_gen_mipmap.c
src/mesa/state_tracker/st_glsl_to_tgsi.cpp
src/mesa/state_tracker/st_manager.c
src/mesa/state_tracker/st_mesa_to_tgsi.c
src/mesa/state_tracker/st_program.c
src/mesa/state_tracker/st_program.h
src/mesa/swrast/s_atifragshader.c
src/mesa/vbo/vbo.h
src/mesa/vbo/vbo_context.c
src/mesa/vbo/vbo_context.h
src/mesa/vbo/vbo_exec.c
src/mesa/vbo/vbo_exec.h
src/mesa/vbo/vbo_exec_array.c
src/util/bitset.h
src/util/list.h
src/util/ralloc.c
src/vulkan/.gitignore [new file with mode: 0644]
src/vulkan/Makefile.am [new file with mode: 0644]
src/vulkan/anv_allocator.c [new file with mode: 0644]
src/vulkan/anv_aub.h [new file with mode: 0644]
src/vulkan/anv_batch_chain.c [new file with mode: 0644]
src/vulkan/anv_cmd_buffer.c [new file with mode: 0644]
src/vulkan/anv_descriptor_set.c [new file with mode: 0644]
src/vulkan/anv_device.c [new file with mode: 0644]
src/vulkan/anv_dump.c [new file with mode: 0644]
src/vulkan/anv_entrypoints_gen.py [new file with mode: 0644]
src/vulkan/anv_formats.c [new file with mode: 0644]
src/vulkan/anv_gem.c [new file with mode: 0644]
src/vulkan/anv_gem_stubs.c [new file with mode: 0644]
src/vulkan/anv_gen_macros.h [new file with mode: 0644]
src/vulkan/anv_icd.json.in [new file with mode: 0644]
src/vulkan/anv_image.c [new file with mode: 0644]
src/vulkan/anv_intel.c [new file with mode: 0644]
src/vulkan/anv_meta.c [new file with mode: 0644]
src/vulkan/anv_meta.h [new file with mode: 0644]
src/vulkan/anv_meta_clear.c [new file with mode: 0644]
src/vulkan/anv_meta_clear.h [new file with mode: 0644]
src/vulkan/anv_nir.h [new file with mode: 0644]
src/vulkan/anv_nir_apply_dynamic_offsets.c [new file with mode: 0644]
src/vulkan/anv_nir_apply_pipeline_layout.c [new file with mode: 0644]
src/vulkan/anv_nir_lower_push_constants.c [new file with mode: 0644]
src/vulkan/anv_pass.c [new file with mode: 0644]
src/vulkan/anv_pipeline.c [new file with mode: 0644]
src/vulkan/anv_private.h [new file with mode: 0644]
src/vulkan/anv_query.c [new file with mode: 0644]
src/vulkan/anv_util.c [new file with mode: 0644]
src/vulkan/anv_wsi.c [new file with mode: 0644]
src/vulkan/anv_wsi.h [new file with mode: 0644]
src/vulkan/anv_wsi_wayland.c [new file with mode: 0644]
src/vulkan/anv_wsi_x11.c [new file with mode: 0644]
src/vulkan/gen75_pack.h [new file with mode: 0644]
src/vulkan/gen7_cmd_buffer.c [new file with mode: 0644]
src/vulkan/gen7_pack.h [new file with mode: 0644]
src/vulkan/gen7_pipeline.c [new file with mode: 0644]
src/vulkan/gen7_state.c [new file with mode: 0644]
src/vulkan/gen8_cmd_buffer.c [new file with mode: 0644]
src/vulkan/gen8_pack.h [new file with mode: 0644]
src/vulkan/gen8_pipeline.c [new file with mode: 0644]
src/vulkan/gen8_state.c [new file with mode: 0644]
src/vulkan/gen9_pack.h [new file with mode: 0644]
src/vulkan/genX_cmd_buffer.c [new file with mode: 0644]
src/vulkan/genX_pipeline_util.h [new file with mode: 0644]
src/vulkan/genX_state_util.h [new file with mode: 0644]
src/vulkan/tests/.gitignore [new file with mode: 0644]
src/vulkan/tests/Makefile.am [new file with mode: 0644]
src/vulkan/tests/block_pool_no_free.c [new file with mode: 0644]
src/vulkan/tests/state_pool.c [new file with mode: 0644]
src/vulkan/tests/state_pool_free_list_only.c [new file with mode: 0644]
src/vulkan/tests/state_pool_no_free.c [new file with mode: 0644]
src/vulkan/tests/state_pool_test_helper.h [new file with mode: 0644]

diff --git a/README.intel-vulkan.txt b/README.intel-vulkan.txt
new file mode 100644 (file)
index 0000000..8d8a7f5
--- /dev/null
@@ -0,0 +1,135 @@
+Intel's Open Source Vulkan Driver
+Vulkan API Version: 0.210.1
+SPIR-V Version: 1.0
+
+Intro
+=====
+The Open Source Technology Center 3D graphics team at Intel has
+been working on a Vulkan implementation based on the Mesa open source
+OpenGL implementation. At this point we're ready to share what we have
+with our Khronos friends, in the hope that an early preview will be
+useful.
+
+When the Vulkan specification goes public, we will continue the work
+in the Mesa public git repository, but in the interim we will share
+our progress on the 'vulkan' branch in the 'mesa' repository in
+Khronos gitlab.
+
+The Mesa project source and our driver implementation is under the MIT
+license [1], but is also covered by the Khronos IP framework as it
+pertains to a specification under construction [2].
+
+We welcome all feedback and contributions, as long as the contributions
+are MIT licensed and can be open sourced with the driver.
+
+[1] https://opensource.org/licenses/MIT
+[2] https://www.khronos.org/members/ip-framework
+
+
+Maintainers
+===========
+Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>
+Jason Ekstrand <jason.ekstrand@intel.com>
+Chad Versace <chad.versace@intel.com>
+
+
+Supported Hardware
+==================
+- Broadwell, main development focus
+- Ivybridge
+- Skylake
+- Haswell
+- Bay Trail
+- Cherryview
+- Broxton
+
+
+Supported OS Platforms
+======================
+ - Linux, tested on Fedora 22 with kernel >= 4.1
+     - X11 with DRI3
+     - Wayland
+ - Android
+     - TODO
+
+
+Building and Installing
+=======================
+This driver is intended to be used directly from the build tree. Installing the
+driver into a system location is not yet fully supported. If you require support
+for system-wide installation, please contact a maintainer.
+
+Throughout the instructions, MESA_TOP refers to the top of the Mesa repository.
+
+First, install the usual dependencies needed to build Mesa.
+
+        Fedora:
+            $ sudo yum builddep mesa
+        Ubuntu:
+            $ FINISHME
+
+Next, configure and build. The below commands will build Mesa in release mode.
+If you wish to build Mesa in debug mode, add option '--enable-debug' to the
+configure command.
+
+        $ cd $MESA_TOP
+        $ autoreconf -vfi
+        $ ./configure --with-dri-drivers=i965 --with-gallium-drivers=
+        $ make
+
+To use the driver's libvulkan.so directly, without LunarG's loader, you must set
+an environment variable before running your Vulkan application:
+
+        $ export LD_LIBRARY_PATH="$MESA_TOP/lib"
+        $ your-vk-app
+
+Alternatively, to use the driver with LunarG's loader:
+
+        $ export VK_ICD_FILENAMES="$MESA_TOP/src/vulkan/anv_icd.json"
+        $ your-vk-app
+
+
+File Structure and Naming
+=========================
+The core code of Intel's Mesa Vulkan driver lives in src/vulkan. Files prefixed
+with "gen8" support Broadwell; files prefixed with "gen7" support Ivybridge;
+files prefixed with "anv" are common to all hardware generations.
+
+Mesa is an umbrella open source project containing many drivers for multiple
+APIs. The codename for Intel's Mesa Vulkan driver is "Anvil", hence the filename
+prefix "anv".
+
+
+Feature Status
+==============
+The driver is still a work-in-progress. We do our best to keep the below list of
+features up-to-date.
+
+Supported Features:
+  - Index buffers, instanced draw, indirect draw
+  - Nested command buffers
+  - Consumes SPIR-V (no GLSL "backdoor")
+  - Fragment, vertex, geometry, and compute shaders
+  - Uniform buffers, sampled images, dynamic uniform buffers
+  - Shader storage buffers
+  - Push constants (VS and FS only)
+  - Color, depth and stencil attachments
+  - 1D, 2D, 3D textures, texture arrays
+  - Memory barrier
+  - Optionally integrates with LunarG's loader
+  - WSI extension for X11
+  - Fences
+  - Most copy/blit commands for color and depth buffers,
+    vkCmdCopyImageToBuffer for stencil buffers
+  - Occlusion queries and timestamps
+  - VkSemaphore and VkEvent
+
+Unsupported Features:
+   - Shader specialization
+   - Storage images
+   - Tessellation shaders
+   - Push constants in GS and CS (and VS on HSW and prior)
+   - Sparse resources
+   - MSAA
+   - vkCmdClear commands
+   - Input attachments
index b6680d0..44d16c6 100644 (file)
@@ -72,8 +72,8 @@ LIBDRM_REQUIRED=2.4.60
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
-LIBDRM_NVVIEUX_REQUIRED=2.4.33
-LIBDRM_NOUVEAU_REQUIRED=2.4.62
+LIBDRM_NVVIEUX_REQUIRED=2.4.66
+LIBDRM_NOUVEAU_REQUIRED=2.4.66
 LIBDRM_FREEDRENO_REQUIRED=2.4.65
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
@@ -98,7 +98,7 @@ AC_PROG_CXX
 AM_PROG_CC_C_O
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
-AC_CHECK_PROGS([PYTHON2], [python2 python])
+AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
 AC_PROG_SED
 AC_PROG_MKDIR_P
 
@@ -245,7 +245,7 @@ _SAVE_LDFLAGS="$LDFLAGS"
 _SAVE_CPPFLAGS="$CPPFLAGS"
 
 dnl Compiler macros
-DEFINES="-D__STDC_LIMIT_MACROS"
+DEFINES="-D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS"
 AC_SUBST([DEFINES])
 case "$host_os" in
 linux*|*-gnu*|gnu*)
@@ -383,10 +383,11 @@ save_CFLAGS="$CFLAGS"
 CFLAGS="$SSE41_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #include <smmintrin.h>
+int param;
 int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
     c = _mm_max_epu32(a, b);
-    return 0;
+    return _mm_cvtsi128_si32(c);
 }]])], SSE41_SUPPORTED=1)
 CFLAGS="$save_CFLAGS"
 if test "x$SSE41_SUPPORTED" = x1; then
@@ -395,6 +396,61 @@ fi
 AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1])
 AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS)
 
+dnl Check for Endianness
+AC_C_BIGENDIAN(
+   little_endian=no,
+   little_endian=yes,
+   little_endian=no,
+   little_endian=no
+)
+
+dnl Check for POWER8 Architecture
+PWR8_CFLAGS="-mpower8-vector"
+have_pwr8_intrinsics=no
+AC_MSG_CHECKING(whether gcc supports -mpower8-vector)
+save_CFLAGS=$CFLAGS
+CFLAGS="$PWR8_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for sane POWER8 support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned char r;
+    vector unsigned int v = vec_splat_u32 (1);
+    r = __builtin_vec_vgbbd ((vector unsigned char) v);
+    return 0;
+}]])], have_pwr8_intrinsics=yes)
+CFLAGS=$save_CFLAGS
+
+AC_ARG_ENABLE(pwr8,
+   [AC_HELP_STRING([--disable-pwr8-inst],
+                   [disable POWER8-specific instructions])],
+   [enable_pwr8=$enableval], [enable_pwr8=auto])
+
+if test "x$enable_pwr8" = xno ; then
+   have_pwr8_intrinsics=disabled
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = yes ; then
+   DEFINES="$DEFINES -D_ARCH_PWR8"
+   CXXFLAGS="$CXXFLAGS $PWR8_CFLAGS"
+   CFLAGS="$CFLAGS $PWR8_CFLAGS"
+else
+   PWR8_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_pwr8_intrinsics)
+if test "x$enable_pwr8" = xyes && test $have_pwr8_intrinsics = no ; then
+   AC_MSG_ERROR([POWER8 compiler support not detected])
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = no ; then
+   AC_MSG_WARN([POWER8 optimization is enabled only on POWER8 Little-Endian])
+fi
+
+AC_SUBST([PWR8_CFLAGS], $PWR8_CFLAGS)
+
 dnl Can't have static and shared libraries, default to static if user
 dnl explicitly requested. If both disabled, set to static since shared
 dnl was explicitly requested.
@@ -420,8 +476,29 @@ AC_ARG_ENABLE([debug],
     [enable_debug="$enableval"],
     [enable_debug=no]
 )
+
+AC_ARG_ENABLE([profile],
+    [AS_HELP_STRING([--enable-profile],
+        [enable profiling of code @<:@default=disabled@:>@])],
+    [enable_profile="$enableval"],
+    [enable_profile=no]
+)
+
+if test "x$enable_profile" = xyes; then
+    DEFINES="$DEFINES -DPROFILE"
+    if test "x$GCC" = xyes; then
+        CFLAGS="$CFLAGS -fno-omit-frame-pointer"
+    fi
+    if test "x$GXX" = xyes; then
+        CXXFLAGS="$CXXFLAGS -fno-omit-frame-pointer"
+    fi
+fi
+
 if test "x$enable_debug" = xyes; then
     DEFINES="$DEFINES -DDEBUG"
+    if test "x$enable_profile" = xyes; then
+        AC_MSG_WARN([Debug and Profile are enabled at the same time])
+    fi
     if test "x$GCC" = xyes; then
         if ! echo "$CFLAGS" | grep -q -e '-g'; then
             CFLAGS="$CFLAGS -g"
@@ -1565,6 +1642,8 @@ GBM_PC_LIB_PRIV="$DLOPEN_LIBS"
 AC_SUBST([GBM_PC_REQ_PRIV])
 AC_SUBST([GBM_PC_LIB_PRIV])
 
+AM_CONDITIONAL(HAVE_VULKAN, true)
+
 dnl
 dnl EGL configuration
 dnl
@@ -1712,7 +1791,15 @@ AC_ARG_WITH([clang-libdir],
    [CLANG_LIBDIR=''])
 
 PKG_CHECK_EXISTS([libclc], [have_libclc=yes], [have_libclc=no])
-AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;ELF_LIB=-lelf])
+PKG_CHECK_MODULES([LIBELF], [libelf], [have_libelf=yes], [have_libelf=no])
+
+if test "x$have_libelf" = xno; then
+   LIBELF_LIBS=''
+   LIBELF_CFLAGS=''
+   AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;LIBELF_LIBS=-lelf], [have_libelf=no])
+   AC_SUBST([LIBELF_LIBS])
+   AC_SUBST([LIBELF_CFLAGS])
+fi
 
 if test "x$enable_opencl" = xyes; then
     if test -z "$with_gallium_drivers"; then
@@ -2299,8 +2386,6 @@ if test "x$USE_VC4_SIMULATOR" = xyes -a "x$HAVE_GALLIUM_ILO" = xyes; then
     AC_MSG_ERROR([VC4 simulator on x86 replaces i965 driver build, so ilo must be disabled.])
 fi
 
-AC_SUBST([ELF_LIB])
-
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
@@ -2336,6 +2421,13 @@ AC_SUBST([XA_MINOR], $XA_MINOR)
 AC_SUBST([XA_TINY], $XA_TINY)
 AC_SUBST([XA_VERSION], "$XA_MAJOR.$XA_MINOR.$XA_TINY")
 
+PKG_CHECK_MODULES(VALGRIND, [valgrind],
+                  [have_valgrind=yes], [have_valgrind=no])
+if test "x$have_valgrind" = "xyes"; then
+    AC_DEFINE([HAVE_VALGRIND], 1,
+              [Use valgrind intrinsics to suppress false warnings])
+fi
+
 dnl Restore LDFLAGS and CPPFLAGS
 LDFLAGS="$_SAVE_LDFLAGS"
 CPPFLAGS="$_SAVE_CPPFLAGS"
@@ -2427,6 +2519,7 @@ AC_CONFIG_FILES([Makefile
                src/glx/apple/Makefile
                src/glx/tests/Makefile
                src/gtest/Makefile
+               src/isl/Makefile
                src/loader/Makefile
                src/mapi/Makefile
                src/mapi/es1api/glesv1_cm.pc
@@ -2448,6 +2541,9 @@ AC_CONFIG_FILES([Makefile
                src/mesa/drivers/osmesa/osmesa.pc
                src/mesa/drivers/x11/Makefile
                src/mesa/main/tests/Makefile
+               src/vulkan/Makefile
+               src/vulkan/anv_icd.json
+               src/vulkan/tests/Makefile
                src/util/Makefile
                src/util/tests/hash_table/Makefile])
 
index 84b5a17..f12e0ba 100644 (file)
@@ -112,7 +112,7 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi
   GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50)
   GL_ARB_shader_subroutine                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE ()
+  GL_ARB_tessellation_shader                           DONE (i965)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, llvmpipe, softpipe)
   GL_ARB_texture_gather                                DONE (i965, nv50, llvmpipe, softpipe)
@@ -184,7 +184,7 @@ GL 4.4, GLSL 4.40:
   - forced alignment within blocks                     in progress
   - specified vec4-slot component numbers              in progress
   - specified transform/feedback layout                in progress
-  - input/output block locations                       in progress
+  - input/output block locations                       DONE
   GL_ARB_multi_bind                                    DONE (all drivers)
   GL_ARB_query_buffer_object                           not started
   GL_ARB_texture_mirror_clamp_to_edge                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
@@ -233,7 +233,7 @@ GLES3.1, GLSL ES 3.1
       glMemoryBarrierByRegion                          DONE
       glGetTexLevelParameter[fi]v - needs updates      DONE
       glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support
+      gl_HelperInvocation support                      DONE (i965, nvc0, r600)
 
 GLES3.2, GLSL ES 3.2
   GL_EXT_color_buffer_float                            DONE (all drivers)
index 6612cbe..294804f 100644 (file)
 <li><a href="http://www.opengl.org" target="_parent">OpenGL website</a>
 <li><a href="http://dri.freedesktop.org" target="_parent">DRI website</a>
 <li><a href="http://www.freedesktop.org" target="_parent">freedesktop.org</a>
+<li><a href="http://planet.freedesktop.org" target="_parent">Developer blogs</a>
 </ul>
 
 <b>Hosted by:</b>
 <br>
 <blockquote>
 <a href="http://sourceforge.net"
-target="_parent"><img src="http://sourceforge.net/sflogo.php?group_id=3&amp;type=1"
-width="88" height="31" align="bottom" alt="Sourceforge.net" border="0"></a>
+target="_parent">sourceforge.net</a>
 </blockquote>
 
 </body>
index 6b1221d..4c6b276 100644 (file)
 
 <h1>News</h1>
 
+<h2>January 13, 2016</h2>
+<p>
+<a href="relnotes/11.1.1.html">Mesa 11.1.1</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>December 21, 2015</h2>
+<p>
+<a href="relnotes/11.0.8.html">Mesa 11.0.8</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>December 15, 2015</h2>
+<p>
+<a href="relnotes/11.1.0.html">Mesa 11.1.0</a> is released.  This is a new
+development release.  See the release notes for more information about
+the release.
+</p>
+
 <h2>December 9, 2015</h2>
 <p>
 <a href="relnotes/11.0.7.html">Mesa 11.0.7</a> is released.
index a8aaa5f..6ae05b6 100644 (file)
@@ -21,6 +21,9 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/11.1.1.html">11.1.1 release notes</a>
+<li><a href="relnotes/11.0.8.html">11.0.8 release notes</a>
+<li><a href="relnotes/11.1.0.html">11.1.0 release notes</a>
 <li><a href="relnotes/11.0.7.html">11.0.7 release notes</a>
 <li><a href="relnotes/11.0.6.html">11.0.6 release notes</a>
 <li><a href="relnotes/11.0.5.html">11.0.5 release notes</a>
diff --git a/docs/relnotes/11.0.8.html b/docs/relnotes/11.0.8.html
new file mode 100644 (file)
index 0000000..ce6e1d6
--- /dev/null
@@ -0,0 +1,200 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.8 Release Notes / December 9, 2015</h1>
+
+<p>
+Mesa 11.0.8 is a bug fix release which fixes bugs found since the 11.0.7 release.
+</p>
+<p>
+Mesa 11.0.8 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ab9db87b54d7525e4b611b82577ea9a9eae55927558df57b190059d5ecd9406f  mesa-11.0.8.tar.gz
+5696e4730518b6805d2ed5def393c4293f425a2c2c01bd5ed4bdd7ad62f7ad75  mesa-11.0.8.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>radeon/uvd: uv pitch separation for stoney</li>
+</ul>
+
+<p>Dave Airlie (9):</p>
+<ul>
+  <li>r600: do SQ flush ES ring rolling workaround</li>
+  <li>r600: SMX returns CONTEXT_DONE early workaround</li>
+  <li>r600/shader: split address get out to a function.</li>
+  <li>r600/shader: add utility functions to do single slot arithmatic</li>
+  <li>r600g: fix geom shader input indirect indexing.</li>
+  <li>r600: handle geometry dynamic input array index</li>
+  <li>radeonsi: handle doubles in lds load path.</li>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+</ul>
+
+<p>Emil Velikov (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.7</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Revert "i965/nir: Remove unused indirect handling"</li>
+  <li>Revert "i965/state: Get rid of dword_pitch arguments to buffer functions"</li>
+  <li>Revert "i965/vec4: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/fs: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge"</li>
+  <li>Update version to 11.0.8</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965: Resolve color and flush for all active shader images in intel_update_state().</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (17):</p>
+<ul>
+  <li>freedreno/a4xx: support lod_bias</li>
+  <li>freedreno/a4xx: fix 5_5_5_1 texture sampler format</li>
+  <li>freedreno/a4xx: point regid to "red" even for alpha-only rb formats</li>
+  <li>nvc0/ir: fold postfactor into immediate</li>
+  <li>nv50/ir: deal with loops with no breaks</li>
+  <li>nv50/ir: the mad source might not have a defining instruction</li>
+  <li>nv50/ir: fix instruction permutation logic</li>
+  <li>nv50/ir: don't forget to mark flagsDef on cvt in txb lowering</li>
+  <li>nv50/ir: fix DCE to not generate 96-bit loads</li>
+  <li>nv50/ir: avoid looking at uninitialized srcMods entries</li>
+  <li>gk110/ir: fix imul hi emission with limm arg</li>
+  <li>gk104/ir: sampler doesn't matter for txf</li>
+  <li>gk110/ir: fix imad sat/hi flag emission for immediate args</li>
+  <li>nv50/ir: fix cutoff for using r63 vs r127 when replacing zero</li>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>ttn: add TEX2 support</li>
+</ul>
+
+<p>Jason Ekstrand (5):</p>
+<ul>
+  <li>i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge</li>
+  <li>i965/fs: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/vec4: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/state: Get rid of dword_pitch arguments to buffer functions</li>
+  <li>i965/nir: Remove unused indirect handling</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix fragment shader struct inputs.</li>
+  <li>i965: Fix scalar vertex shader struct outputs.</li>
+</ul>
+
+<p>Marek Olšák (8):</p>
+<ul>
+  <li>radeonsi: fix occlusion queries on Fiji</li>
+  <li>radeonsi: fix a hang due to uninitialized border color registers</li>
+  <li>radeonsi: fix Fiji for LLVM &lt;= 3.7</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>glsl: Allow binding of image variables with 420pack.</li>
+</ul>
+
+<p>Neil Roberts (2):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>i965: use _Shader to get fragment program when updating surface state</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>radeonsi: Rename si_shader::ls_rsrc{1,2} to si_shader::rsrc{1,2}</li>
+  <li>radeonsi/compute: Use the compiler's COMPUTE_PGM_RSRC* register values</li>
+</ul>
+
+
+</div>
+</body>
+</html>
index 542f368..9dd21dd 100644 (file)
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">
 
-<h1>Mesa 11.1.0 Release Notes / TBD</h1>
+<h1>Mesa 11.1.0 Release Notes / 15 December 2015</h1>
 
 <p>
 Mesa 11.1.0 is a new development release.
@@ -33,7 +33,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD.
+e3bc44be4df5e4dc728dfda7b55b1aaeadfce36eca6a367b76cc07598070cb2d  mesa-11.1.0.tar.gz
+9befe03b04223eb1ede177fa8cac001e2850292c8c12a3ec9929106afad9cf1f  mesa-11.1.0.tar.xz
 </pre>
 
 
@@ -84,11 +85,196 @@ Note: some of the new features are only available with certain drivers.
 
 <h2>Bug fixes</h2>
 
-TBD.
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28130">Bug 28130</a> - vbo: premature flushing breaks GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38109">Bug 38109</a> - i915 driver crashes if too few vertices are submitted (Mesa 7.10.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49779">Bug 49779</a> - Extra line segments in GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55552">Bug 55552</a> - Compile errors with --enable-mangling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71789">Bug 71789</a> - [r300g] Visuals not found in (default) depth = 24</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=79783">Bug 79783</a> - Distorted output in obs-studio where other vendors &quot;work&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80821">Bug 80821</a> - When LIBGL_ALWAYS_SOFTWARE is set, KHR_create_context is not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81174">Bug 81174</a> - Gallium: GL_LINE_LOOP broken with more than 512 points</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83508">Bug 83508</a> - [UBO] Assertion for array of blocks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84677">Bug 84677</a> - Triangle disappears with glPolygonMode GL_LINE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86469">Bug 86469</a> - Unreal Engine demo doesn't run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89014">Bug 89014</a> - PIPE_QUERY_GPU_FINISHED is not acting as expected on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90175">Bug 90175</a> - [hsw bisected][PATCH] atomic counters doesn't work for a binding point different to zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90348">Bug 90348</a> - Spilling failure of b96 merged value</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90631">Bug 90631</a> - Compilation failure for fragment shader with many branches on Sandy Bridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90734">Bug 90734</a> - glBufferSubData is corrupting data when buffer is &gt; 32k</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90887">Bug 90887</a> - PhiMovesPass in register allocator broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91044">Bug 91044</a> - piglit spec/egl_khr_create_context/valid debug flag gles* fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91114">Bug 91114</a> - ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91254">Bug 91254</a> - (regresion) video using VA-API on Intel slow and freeze system with mesa 10.6 or 10.6.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91292">Bug 91292</a> - [BDW+] glVertexAttribDivisor not working in combination with glPolygonMode</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91526">Bug 91526</a> - World of Warcraft (on Wine) has UI corruption with nouveau</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91551">Bug 91551</a> - DXTn compressed normal maps produce severe artifacts on all NV5x and NVDx chipsets</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91716">Bug 91716</a> - [bisected] piglit.shaders.glsl-vs-int-attrib regresses on 32 bit BYT, HSW, IVB, SNB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91718">Bug 91718</a> - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91719">Bug 91719</a> - [SNB,HSW,BYT] dEQP regressions associated with using NIR for vertex shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91726">Bug 91726</a> - R600 asserts in tgsi_cmp/make_src_for_op3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91780">Bug 91780</a> - Rendering issues with geometry shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91785">Bug 91785</a> - make check DispatchSanity_test.GLES31 regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91847">Bug 91847</a> - glGenerateTextureMipmap not working (no errors) unless glActiveTexture(GL_TEXTURE1) is called before</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91857">Bug 91857</a> - Mesa 10.6.3 linker is slow</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91881">Bug 91881</a> - regression: GPU lockups since mesa-11.0.0_rc1 on RV620 (r600) driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91890">Bug 91890</a> - [nve7] witcher2: blurry image &amp; DATA_ERRORs (class 0xa097 mthd 0x2380/0x238c)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91898">Bug 91898</a> - src/util/mesa-sha1.c:250:25: fatal error: openssl/sha.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91927">Bug 91927</a> - [SKL] [regression] piglit compressed textures tests fail  with kernel upgrade</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91930">Bug 91930</a> - Program with GtkGLArea widget does not redraw</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91970">Bug 91970</a> - [BSW regression] dEQP-GLES3.functional.shaders.precision.int.highp_mul_vertex</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91985">Bug 91985</a> - [regression, bisected] FTBFS with commit f9caabe8f1: R600_UCP_CONST_BUFFER is undefined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92009">Bug 92009</a> - ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92033">Bug 92033</a> - [SNB,regression,dEQP,bisected] functional.shaders.random tests regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92052">Bug 92052</a> - nir/nir_builder.h:79: error: expected primary-expression before ‘.’ token</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92054">Bug 92054</a> - make check gbm-symbols-check regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92066">Bug 92066</a> - [ILK,G45,regression] New assertion on BRW_MAX_MRF breaks ilk and g45</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92072">Bug 92072</a> - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92095">Bug 92095</a> - [Regression, bisected] arb_shader_atomic_counters.compiler.builtins.frag</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92122">Bug 92122</a> - [bisected, cts] Regression with Assault Android Cactus</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92124">Bug 92124</a> - shader_query.cpp:841:34: error: ‘strndup’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92183">Bug 92183</a> - linker.cpp:3187:46: error: ‘strtok_r’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92193">Bug 92193</a> - [SKL] ES2-CTS.gtf.GL2ExtensionTests.compressed_astc_texture.compressed_astc_texture fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92221">Bug 92221</a> - Unintended code changes in _mesa_base_tex_format commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92265">Bug 92265</a> - Black windows in weston after update mesa to 11.0.2-1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92363">Bug 92363</a> - [BSW/BDW] ogles1conform Gets test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92438">Bug 92438</a> - Segfault in pushbuf_kref when running the android emulator (qemu) on nv50</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92588">Bug 92588</a> - [HSW,BDW,BSW,SKL-Y][GLES 3.1 CTS] ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 - assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92621">Bug 92621</a> - [G965 ILK G45] Regression: 24 piglit regressions in glsl-1.10</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92634">Bug 92634</a> - gallium's vl_mpeg12_decoder does not work with st/va</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92639">Bug 92639</a> - [Regression bisected] Ogles1conform mustpass.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92641">Bug 92641</a> - [SKL BSW] [Regression] Ogles1conform userclip.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92645">Bug 92645</a> - kodi vdpau interop fails since  mesa,meta: move gl_texture_object::TargetIndex initializations</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92705">Bug 92705</a> - [clover] fail to build with llvm-svn/clang-svn 3.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92709">Bug 92709</a> - &quot;LLVM triggered Diagnostic Handler: unsupported call to function ldexpf in main&quot; when starting race in stuntrally</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92738">Bug 92738</a> - Randon R7 240 doesn't work on 16KiB page size platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92744">Bug 92744</a> - [g965 Regression bisected] Performance regression and piglit assertions due to liveness analysis</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92770">Bug 92770</a> - [SNB, regression, dEQP] deqp-gles3.functional.shaders.discard.dynamic_loop_texture</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92824">Bug 92824</a> - [regression, bisected] `make check` dispatch-sanity broken by GL_EXT_buffer_storage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92859">Bug 92859</a> - [regression, bisected] validate_intrinsic_instr: Assertion triggered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92860">Bug 92860</a> - [radeonsi][bisected] st/mesa: implement ARB_copy_image - Corruption in ARK Survival Evolved</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92900">Bug 92900</a> - [regression bisected] About 700 piglit regressions is what could go wrong</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92985">Bug 92985</a> - Mac OS X build error &quot;ar: no archive members specified&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93015">Bug 93015</a> - Tonga Elemental segfault + VM faults since  radeon: implement r600_query_hw_get_result via function pointers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93048">Bug 93048</a> - [CTS regression] mesa af2723 breaks GL Conformance for debug extension</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93063">Bug 93063</a> - drm_helper.h:227:1: error: static declaration of ‘pipe_virgl_create_screen’ follows non-static declaration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93091">Bug 93091</a> - [opencl] segfault when running any opencl programs (like clinfo)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93126">Bug 93126</a> - wrongly claim supporting GL_EXT_texture_rg</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93180">Bug 93180</a> - [regression] arb_separate_shader_objects.active sampler conflict fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93235">Bug 93235</a> - [regression] dispatch sanity broken by GetPointerv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
 
 <h2>Changes</h2>
 
-TBD.
+<li>MPEG4 decoding has been disabled by default in the VAAPI driver</li>
 
 </div>
 </body>
diff --git a/docs/relnotes/11.1.1.html b/docs/relnotes/11.1.1.html
new file mode 100644 (file)
index 0000000..3383f70
--- /dev/null
@@ -0,0 +1,197 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.1.1 Release Notes / January 13, 2016</h1>
+
+<p>
+Mesa 11.1.1 is a bug fix release which fixes bugs found since the 11.1.0 release.
+</p>
+<p>
+Mesa 11.1.1 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+b15089817540ba0bffd0aad323ecf3a8ff6779568451827c7274890b4a269d58  mesa-11.1.1.tar.gz
+64db074fc514136b5fb3890111f0d50604db52f0b1e94ba3fcb0fe8668a7fd20  mesa-11.1.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92229">Bug 92229</a> - [APITRACE] SOMA have serious graphical errors</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92233">Bug 92233</a> - Unigine Heaven 4.0 silhuette run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93257">Bug 93257</a> - [SKL, bisected] ASTC dEQP tests segfault</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>st/mesa: check state-&gt;mesa in early return check in st_validate_state()</li>
+</ul>
+
+<p>Dave Airlie (6):</p>
+<ul>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+  <li>glsl: pass stage into mark function</li>
+  <li>glsl/fp64: add helper for dual slot double detection.</li>
+  <li>glsl: fix count_attribute_slots to allow for different 64-bit handling</li>
+  <li>glsl: only update doubles inputs for vertex inputs.</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.1</li>
+  <li>cherry-ignore: drop the "re-enable" DCC on Stoney</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Update version to 11.1.1</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>vc4: Warn instead of abort()ing on exec ioctl failures.</li>
+  <li>vc4: Keep sample mask writes from being reordered after TLB writes</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>r600: fix constant buffer size programming</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (9):</p>
+<ul>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>gk104/ir: simplify and fool-proof texbar algorithm</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>glx/dri3: a drawable might not be bound at wait time</li>
+  <li>nvc0: don't forget to reset VTX_TMP bufctx slot after blit completion</li>
+  <li>nv50/ir: float(s32 &amp; 0xff) = float(u8), not s8</li>
+  <li>nv50,nvc0: make sure there's pushbuf space and that we ref the bo early</li>
+  <li>nv50,nvc0: fix crash when increasing bsp bo size for h264</li>
+  <li>nvc0: scale up inter_bo size so that it's 16M for a 4K video</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (5):</p>
+<ul>
+  <li>ralloc: Fix ralloc_adopt() to the old context's last child's parent.</li>
+  <li>drirc: Disable ARB_blend_func_extended for Heaven 4.0/Valley 1.0.</li>
+  <li>glsl: Fix varying struct locations when varying packing is disabled.</li>
+  <li>nvc0: Set winding order regardless of domain.</li>
+  <li>nir: Add a lower_fdiv option, turn fdiv into fmul/frcp.</li>
+</ul>
+
+<p>Marek Olšák (7):</p>
+<ul>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>program: add _mesa_reserve_parameter_storage</li>
+  <li>st/mesa: fix GLSL uniform updates for glBitmap &amp; glDrawPixels (v2)</li>
+</ul>
+
+<p>Mark Janes (1):</p>
+<ul>
+  <li>Add missing platform information for KBL</li>
+</ul>
+
+<p>Miklós Máté (1):</p>
+<ul>
+  <li>mesa: Don't leak ATIfs instructions in DeleteFragmentShader</li>
+</ul>
+
+<p>Neil Roberts (3):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+  <li>i965: Fix crash when calling glViewport with no surface bound</li>
+</ul>
+
+<p>Nicolai Hähnle (2):</p>
+<ul>
+  <li>gallium/radeon: only dispose locally created target machine in radeon_llvm_compile</li>
+  <li>gallium/radeon: fix regression in a number of driver queries</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Rob Herring (1):</p>
+<ul>
+  <li>freedreno/ir3: fix 32-bit builds with pointer-to-int-cast error enabled</li>
+</ul>
+
+<p>Samuel Pitoiset (3):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+  <li>nv50,nvc0: free memory allocated by performance metrics</li>
+  <li>nv50: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Sarah Sharp (1):</p>
+<ul>
+  <li>mesa: Add KBL PCI IDs and platform information.</li>
+</ul>
+
+
+</div>
+</body>
+</html>
index 12e0f07..616c134 100644 (file)
@@ -47,7 +47,9 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
-<li>GL_ARB_tessellation_shader on r600 (evergreen/cayman only)</li>
+<li>GL_ARB_indirect_parameters on nvc0</li>
+<li>GL_ARB_shader_draw_parameters on i965, nvc0</li>
+<li>GL_ARB_tessellation_shader on i965 and r600 (evergreen/cayman only)</li>
 <li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
 <li>GL_ARB_texture_buffer_range on freedreno/a4xx</li>
 <li>GL_ARB_texture_query_lod on freedreno/a4xx</li>
@@ -56,6 +58,8 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_vertex_type_10f_11f_11f_rev on freedreno/a4xx</li>
 <li>GL_KHR_texture_compression_astc_ldr on freedreno/a4xx</li>
 <li>GL_AMD_performance_monitor on radeonsi (CIK+ only)</li>
+<li>New OSMesaCreateContextAttribs() function (for creating core profile
+    contexts)</li>
 </ul>
 
 <h2>Bug fixes</h2>
index 29a41e4..685cb31 100644 (file)
@@ -42,9 +42,7 @@ Tungsten Graphics, Inc. have supported the ongoing development of Mesa.
 <li>The
 <a href="http://www.mesa3d.org">Mesa</a>
 website is hosted by
-<a href="http://sourceforge.net">
-<img src="http://sourceforge.net/sflogo.php?group_id=3&amp;type=1"
-width="88" height="31" align="bottom" alt="Sourceforge.net" border="0"></a>
+<a href="http://sourceforge.net">sourceforge.net</a>.
 <br>
 <br>
 
index ca0d167..39cd54e 100644 (file)
@@ -58,8 +58,8 @@ extern "C" {
 #include <GL/gl.h>
 
 
-#define OSMESA_MAJOR_VERSION 10
-#define OSMESA_MINOR_VERSION 0
+#define OSMESA_MAJOR_VERSION 11
+#define OSMESA_MINOR_VERSION 2
 #define OSMESA_PATCH_VERSION 0
 
 
@@ -95,6 +95,18 @@ extern "C" {
 #define OSMESA_MAX_WIDTH       0x24  /* new in 4.0 */
 #define OSMESA_MAX_HEIGHT      0x25  /* new in 4.0 */
 
+/*
+ * Accepted in OSMesaCreateContextAttribs' attribute list.
+ */
+#define OSMESA_DEPTH_BITS            0x30
+#define OSMESA_STENCIL_BITS          0x31
+#define OSMESA_ACCUM_BITS            0x32
+#define OSMESA_PROFILE               0x33
+#define OSMESA_CORE_PROFILE          0x34
+#define OSMESA_COMPAT_PROFILE        0x35
+#define OSMESA_CONTEXT_MAJOR_VERSION 0x36
+#define OSMESA_CONTEXT_MINOR_VERSION 0x37
+
 
 typedef struct osmesa_context *OSMesaContext;
 
@@ -128,6 +140,35 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
 
 
 /*
+ * Create an Off-Screen Mesa rendering context with attribute list.
+ * The list is composed of (attribute, value) pairs and terminated with
+ * attribute==0.  Supported Attributes:
+ *
+ * Attributes                    Values
+ * --------------------------------------------------------------------------
+ * OSMESA_FORMAT                 OSMESA_RGBA*, OSMESA_BGRA, OSMESA_ARGB, etc.
+ * OSMESA_DEPTH_BITS             0*, 16, 24, 32
+ * OSMESA_STENCIL_BITS           0*, 8
+ * OSMESA_ACCUM_BITS             0*, 16
+ * OSMESA_PROFILE                OSMESA_COMPAT_PROFILE*, OSMESA_CORE_PROFILE
+ * OSMESA_CONTEXT_MAJOR_VERSION  1*, 2, 3
+ * OSMESA_CONTEXT_MINOR_VERSION  0+
+ *
+ * Note: * = default value
+ *
+ * We return a context version >= what's specified by OSMESA_CONTEXT_MAJOR/
+ * MINOR_VERSION for the given profile.  For example, if you request a GL 1.4
+ * compat profile, you might get a GL 3.0 compat profile.
+ * Otherwise, null is returned if the version/profile is not supported.
+ *
+ * New in Mesa 11.2
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs( const int *attribList, OSMesaContext sharelist );
+
+
+
+/*
  * Destroy an Off-Screen Mesa rendering context.
  *
  * Input:  ctx - the context to destroy
index 5891ba6..5139e27 100644 (file)
@@ -132,6 +132,28 @@ CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x5902, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5906, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590A, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590B, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590E, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5913, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5915, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5917, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5912, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5916, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591A, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591B, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591D, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591E, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F")
+CHIPSET(0x5926, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592A, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592B, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
diff --git a/include/vulkan/vk_platform.h b/include/vulkan/vk_platform.h
new file mode 100644 (file)
index 0000000..a53e725
--- /dev/null
@@ -0,0 +1,127 @@
+//
+// File: vk_platform.h
+//
+/*
+** Copyright (c) 2014-2015 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a
+** copy of this software and/or associated documentation files (the
+** "Materials"), to deal in the Materials without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Materials, and to
+** permit persons to whom the Materials are furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be included
+** in all copies or substantial portions of the Materials.
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+*/
+
+
+#ifndef __VK_PLATFORM_H__
+#define __VK_PLATFORM_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+/*
+***************************************************************************************************
+*   Platform-specific directives and type declarations
+***************************************************************************************************
+*/
+
+/* Platform-specific calling convention macros.
+ *
+ * Platforms should define these so that Vulkan clients call Vulkan commands
+ * with the same calling conventions that the Vulkan implementation expects.
+ *
+ * VKAPI_ATTR - Placed before the return type in function declarations.
+ *              Useful for C++11 and GCC/Clang-style function attribute syntax.
+ * VKAPI_CALL - Placed after the return type in function declarations.
+ *              Useful for MSVC-style calling convention syntax.
+ * VKAPI_PTR  - Placed between the '(' and '*' in function pointer types.
+ *
+ * Function declaration:  VKAPI_ATTR void VKAPI_CALL vkCommand(void);
+ * Function pointer type: typedef void (VKAPI_PTR *PFN_vkCommand)(void);
+ */
+#if defined(_WIN32)
+    // On Windows, Vulkan commands use the stdcall convention
+    #define VKAPI_ATTR
+    #define VKAPI_CALL __stdcall
+    #define VKAPI_PTR  VKAPI_CALL
+#elif defined(__ANDROID__) && defined(__ARM_EABI__) && !defined(__ARM_ARCH_7A__)
+    // Android does not support Vulkan in native code using the "armeabi" ABI.
+    #error "Vulkan requires the 'armeabi-v7a' or 'armeabi-v7a-hard' ABI on 32-bit ARM CPUs"
+#elif defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
+    // On Android/ARMv7a, Vulkan functions use the armeabi-v7a-hard calling
+    // convention, even if the application's native code is compiled with the
+    // armeabi-v7a calling convention.
+    #define VKAPI_ATTR __attribute__((pcs("aapcs-vfp")))
+    #define VKAPI_CALL
+    #define VKAPI_PTR  VKAPI_ATTR
+#else
+    // On other platforms, use the default calling convention
+    #define VKAPI_ATTR
+    #define VKAPI_CALL
+    #define VKAPI_PTR
+#endif
+
+#include <stddef.h>
+
+#if !defined(VK_NO_STDINT_H)
+    #if defined(_MSC_VER) && (_MSC_VER < 1600)
+        typedef signed   __int8  int8_t;
+        typedef unsigned __int8  uint8_t;
+        typedef signed   __int16 int16_t;
+        typedef unsigned __int16 uint16_t;
+        typedef signed   __int32 int32_t;
+        typedef unsigned __int32 uint32_t;
+        typedef signed   __int64 int64_t;
+        typedef unsigned __int64 uint64_t;
+    #else
+        #include <stdint.h>
+    #endif
+#endif // !defined(VK_NO_STDINT_H)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+// Platform-specific headers required by platform window system extensions.
+// These are enabled prior to #including "vulkan.h". The same enable then
+// controls inclusion of the extension interfaces in vulkan.h.
+
+#ifdef VK_USE_PLATFORM_ANDROID_KHR
+#include <android/native_window.h>
+#endif
+
+#ifdef VK_USE_PLATFORM_MIR_KHR
+#include <mir_toolkit/client_types.h>
+#endif
+
+#ifdef VK_USE_PLATFORM_WAYLAND_KHR
+#include <wayland-client.h>
+#endif
+
+#ifdef VK_USE_PLATFORM_WIN32_KHR
+#include <windows.h>
+#endif
+
+#ifdef VK_USE_PLATFORM_XLIB_KHR
+#include <X11/Xlib.h>
+#endif
+
+#ifdef VK_USE_PLATFORM_XCB_KHR
+#include <xcb/xcb.h>
+#endif
+
+#endif // __VK_PLATFORM_H__
diff --git a/include/vulkan/vulkan.h b/include/vulkan/vulkan.h
new file mode 100644 (file)
index 0000000..c40e131
--- /dev/null
@@ -0,0 +1,3670 @@
+#ifndef __vulkan_h_
+#define __vulkan_h_ 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** Copyright (c) 2015 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a
+** copy of this software and/or associated documentation files (the
+** "Materials"), to deal in the Materials without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Materials, and to
+** permit persons to whom the Materials are furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be included
+** in all copies or substantial portions of the Materials.
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+*/
+
+/*
+** This header is generated from the Khronos Vulkan XML API Registry.
+**
+*/
+
+
+#define VK_VERSION_1_0 1
+#include "vk_platform.h"
+
+#define VK_MAKE_VERSION(major, minor, patch) \
+    ((major << 22) | (minor << 12) | patch)
+
+// Vulkan API version supported by this file
+#define VK_API_VERSION VK_MAKE_VERSION(1, 0, 1)
+
+
+#define VK_NULL_HANDLE 0
+        
+
+
+#define VK_DEFINE_HANDLE(object) typedef struct object##_T* object;
+
+
+#if defined(__LP64__) || defined(_WIN64) || defined(__x86_64__) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)
+        #define VK_DEFINE_NON_DISPATCHABLE_HANDLE(object) typedef struct object##_T *object;
+#else
+        #define VK_DEFINE_NON_DISPATCHABLE_HANDLE(object) typedef uint64_t object;
+#endif
+        
+
+
+typedef uint32_t VkFlags;
+typedef uint32_t VkBool32;
+typedef uint64_t VkDeviceSize;
+typedef uint32_t VkSampleMask;
+
+VK_DEFINE_HANDLE(VkInstance)
+VK_DEFINE_HANDLE(VkPhysicalDevice)
+VK_DEFINE_HANDLE(VkDevice)
+VK_DEFINE_HANDLE(VkQueue)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSemaphore)
+VK_DEFINE_HANDLE(VkCommandBuffer)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkFence)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDeviceMemory)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkBuffer)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkImage)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkEvent)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkQueryPool)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkBufferView)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkImageView)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkShaderModule)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipelineCache)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipelineLayout)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkRenderPass)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkPipeline)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorSetLayout)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSampler)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorPool)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorSet)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkFramebuffer)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkCommandPool)
+
+#define VK_LOD_CLAMP_NONE                 1000.0f
+#define VK_REMAINING_MIP_LEVELS           (~0U)
+#define VK_REMAINING_ARRAY_LAYERS         (~0U)
+#define VK_WHOLE_SIZE                     (~0ULL)
+#define VK_ATTACHMENT_UNUSED              (~0U)
+#define VK_TRUE                           1
+#define VK_FALSE                          0
+#define VK_QUEUE_FAMILY_IGNORED           (~0U)
+#define VK_SUBPASS_EXTERNAL               (~0U)
+#define VK_MAX_PHYSICAL_DEVICE_NAME_SIZE  256
+#define VK_UUID_SIZE                      16
+#define VK_MAX_MEMORY_TYPES               32
+#define VK_MAX_MEMORY_HEAPS               16
+#define VK_MAX_EXTENSION_NAME_SIZE        256
+#define VK_MAX_DESCRIPTION_SIZE           256
+
+
+typedef enum VkPipelineCacheHeaderVersion {
+    VK_PIPELINE_CACHE_HEADER_VERSION_ONE = 1,
+    VK_PIPELINE_CACHE_HEADER_VERSION_BEGIN_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
+    VK_PIPELINE_CACHE_HEADER_VERSION_END_RANGE = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
+    VK_PIPELINE_CACHE_HEADER_VERSION_RANGE_SIZE = (VK_PIPELINE_CACHE_HEADER_VERSION_ONE - VK_PIPELINE_CACHE_HEADER_VERSION_ONE + 1),
+    VK_PIPELINE_CACHE_HEADER_VERSION_MAX_ENUM = 0x7FFFFFFF
+} VkPipelineCacheHeaderVersion;
+
+typedef enum VkResult {
+    VK_SUCCESS = 0,
+    VK_NOT_READY = 1,
+    VK_TIMEOUT = 2,
+    VK_EVENT_SET = 3,
+    VK_EVENT_RESET = 4,
+    VK_INCOMPLETE = 5,
+    VK_ERROR_OUT_OF_HOST_MEMORY = -1,
+    VK_ERROR_OUT_OF_DEVICE_MEMORY = -2,
+    VK_ERROR_INITIALIZATION_FAILED = -3,
+    VK_ERROR_DEVICE_LOST = -4,
+    VK_ERROR_MEMORY_MAP_FAILED = -5,
+    VK_ERROR_LAYER_NOT_PRESENT = -6,
+    VK_ERROR_EXTENSION_NOT_PRESENT = -7,
+    VK_ERROR_FEATURE_NOT_PRESENT = -8,
+    VK_ERROR_INCOMPATIBLE_DRIVER = -9,
+    VK_ERROR_TOO_MANY_OBJECTS = -10,
+    VK_ERROR_FORMAT_NOT_SUPPORTED = -11,
+    VK_ERROR_SURFACE_LOST_KHR = -1000000000,
+    VK_SUBOPTIMAL_KHR = 1000001003,
+    VK_ERROR_OUT_OF_DATE_KHR = -1000001004,
+    VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001,
+    VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000008000,
+    VK_ERROR_VALIDATION_FAILED_EXT = -1000011001,
+    VK_RESULT_BEGIN_RANGE = VK_ERROR_FORMAT_NOT_SUPPORTED,
+    VK_RESULT_END_RANGE = VK_INCOMPLETE,
+    VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FORMAT_NOT_SUPPORTED + 1),
+    VK_RESULT_MAX_ENUM = 0x7FFFFFFF
+} VkResult;
+
+typedef enum VkStructureType {
+    VK_STRUCTURE_TYPE_APPLICATION_INFO = 0,
+    VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO = 1,
+    VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO = 2,
+    VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO = 3,
+    VK_STRUCTURE_TYPE_SUBMIT_INFO = 4,
+    VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO = 5,
+    VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE = 6,
+    VK_STRUCTURE_TYPE_BIND_SPARSE_INFO = 7,
+    VK_STRUCTURE_TYPE_FENCE_CREATE_INFO = 8,
+    VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO = 9,
+    VK_STRUCTURE_TYPE_EVENT_CREATE_INFO = 10,
+    VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO = 11,
+    VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO = 12,
+    VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO = 13,
+    VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO = 14,
+    VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO = 15,
+    VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO = 16,
+    VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO = 17,
+    VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO = 18,
+    VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO = 19,
+    VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO = 20,
+    VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO = 21,
+    VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO = 22,
+    VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO = 23,
+    VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO = 24,
+    VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO = 25,
+    VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO = 26,
+    VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO = 27,
+    VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO = 28,
+    VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO = 29,
+    VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO = 30,
+    VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO = 31,
+    VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO = 32,
+    VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO = 33,
+    VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO = 34,
+    VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET = 35,
+    VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET = 36,
+    VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO = 37,
+    VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO = 38,
+    VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO = 39,
+    VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO = 40,
+    VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO = 41,
+    VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO = 42,
+    VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO = 43,
+    VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER = 44,
+    VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER = 45,
+    VK_STRUCTURE_TYPE_MEMORY_BARRIER = 46,
+    VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO = 47,
+    VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO = 48,
+    VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR = 1000001000,
+    VK_STRUCTURE_TYPE_PRESENT_INFO_KHR = 1000001001,
+    VK_STRUCTURE_TYPE_DISPLAY_MODE_CREATE_INFO_KHR = 1000002000,
+    VK_STRUCTURE_TYPE_DISPLAY_SURFACE_CREATE_INFO_KHR = 1000002001,
+    VK_STRUCTURE_TYPE_DISPLAY_PRESENT_INFO_KHR = 1000003000,
+    VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR = 1000004000,
+    VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR = 1000005000,
+    VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR = 1000006000,
+    VK_STRUCTURE_TYPE_MIR_SURFACE_CREATE_INFO_KHR = 1000007000,
+    VK_STRUCTURE_TYPE_ANDROID_SURFACE_CREATE_INFO_KHR = 1000008000,
+    VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR = 1000009000,
+    VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = 1000011000,
+    VK_STRUCTURE_TYPE_BEGIN_RANGE = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+    VK_STRUCTURE_TYPE_END_RANGE = VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO,
+    VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1),
+    VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkStructureType;
+
+typedef enum VkSystemAllocationScope {
+    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND = 0,
+    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT = 1,
+    VK_SYSTEM_ALLOCATION_SCOPE_CACHE = 2,
+    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE = 3,
+    VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE = 4,
+    VK_SYSTEM_ALLOCATION_SCOPE_BEGIN_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_COMMAND,
+    VK_SYSTEM_ALLOCATION_SCOPE_END_RANGE = VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE,
+    VK_SYSTEM_ALLOCATION_SCOPE_RANGE_SIZE = (VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND + 1),
+    VK_SYSTEM_ALLOCATION_SCOPE_MAX_ENUM = 0x7FFFFFFF
+} VkSystemAllocationScope;
+
+typedef enum VkInternalAllocationType {
+    VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE = 0,
+    VK_INTERNAL_ALLOCATION_TYPE_BEGIN_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE,
+    VK_INTERNAL_ALLOCATION_TYPE_END_RANGE = VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE,
+    VK_INTERNAL_ALLOCATION_TYPE_RANGE_SIZE = (VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE - VK_INTERNAL_ALLOCATION_TYPE_EXECUTABLE + 1),
+    VK_INTERNAL_ALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkInternalAllocationType;
+
+typedef enum VkFormat {
+    VK_FORMAT_UNDEFINED = 0,
+    VK_FORMAT_R4G4_UNORM_PACK8 = 1,
+    VK_FORMAT_R4G4B4A4_UNORM_PACK16 = 2,
+    VK_FORMAT_B4G4R4A4_UNORM_PACK16 = 3,
+    VK_FORMAT_R5G6B5_UNORM_PACK16 = 4,
+    VK_FORMAT_B5G6R5_UNORM_PACK16 = 5,
+    VK_FORMAT_R5G5B5A1_UNORM_PACK16 = 6,
+    VK_FORMAT_B5G5R5A1_UNORM_PACK16 = 7,
+    VK_FORMAT_A1R5G5B5_UNORM_PACK16 = 8,
+    VK_FORMAT_R8_UNORM = 9,
+    VK_FORMAT_R8_SNORM = 10,
+    VK_FORMAT_R8_USCALED = 11,
+    VK_FORMAT_R8_SSCALED = 12,
+    VK_FORMAT_R8_UINT = 13,
+    VK_FORMAT_R8_SINT = 14,
+    VK_FORMAT_R8_SRGB = 15,
+    VK_FORMAT_R8G8_UNORM = 16,
+    VK_FORMAT_R8G8_SNORM = 17,
+    VK_FORMAT_R8G8_USCALED = 18,
+    VK_FORMAT_R8G8_SSCALED = 19,
+    VK_FORMAT_R8G8_UINT = 20,
+    VK_FORMAT_R8G8_SINT = 21,
+    VK_FORMAT_R8G8_SRGB = 22,
+    VK_FORMAT_R8G8B8_UNORM = 23,
+    VK_FORMAT_R8G8B8_SNORM = 24,
+    VK_FORMAT_R8G8B8_USCALED = 25,
+    VK_FORMAT_R8G8B8_SSCALED = 26,
+    VK_FORMAT_R8G8B8_UINT = 27,
+    VK_FORMAT_R8G8B8_SINT = 28,
+    VK_FORMAT_R8G8B8_SRGB = 29,
+    VK_FORMAT_B8G8R8_UNORM = 30,
+    VK_FORMAT_B8G8R8_SNORM = 31,
+    VK_FORMAT_B8G8R8_USCALED = 32,
+    VK_FORMAT_B8G8R8_SSCALED = 33,
+    VK_FORMAT_B8G8R8_UINT = 34,
+    VK_FORMAT_B8G8R8_SINT = 35,
+    VK_FORMAT_B8G8R8_SRGB = 36,
+    VK_FORMAT_R8G8B8A8_UNORM = 37,
+    VK_FORMAT_R8G8B8A8_SNORM = 38,
+    VK_FORMAT_R8G8B8A8_USCALED = 39,
+    VK_FORMAT_R8G8B8A8_SSCALED = 40,
+    VK_FORMAT_R8G8B8A8_UINT = 41,
+    VK_FORMAT_R8G8B8A8_SINT = 42,
+    VK_FORMAT_R8G8B8A8_SRGB = 43,
+    VK_FORMAT_B8G8R8A8_UNORM = 44,
+    VK_FORMAT_B8G8R8A8_SNORM = 45,
+    VK_FORMAT_B8G8R8A8_USCALED = 46,
+    VK_FORMAT_B8G8R8A8_SSCALED = 47,
+    VK_FORMAT_B8G8R8A8_UINT = 48,
+    VK_FORMAT_B8G8R8A8_SINT = 49,
+    VK_FORMAT_B8G8R8A8_SRGB = 50,
+    VK_FORMAT_A8B8G8R8_UNORM_PACK32 = 51,
+    VK_FORMAT_A8B8G8R8_SNORM_PACK32 = 52,
+    VK_FORMAT_A8B8G8R8_USCALED_PACK32 = 53,
+    VK_FORMAT_A8B8G8R8_SSCALED_PACK32 = 54,
+    VK_FORMAT_A8B8G8R8_UINT_PACK32 = 55,
+    VK_FORMAT_A8B8G8R8_SINT_PACK32 = 56,
+    VK_FORMAT_A8B8G8R8_SRGB_PACK32 = 57,
+    VK_FORMAT_A2R10G10B10_UNORM_PACK32 = 58,
+    VK_FORMAT_A2R10G10B10_SNORM_PACK32 = 59,
+    VK_FORMAT_A2R10G10B10_USCALED_PACK32 = 60,
+    VK_FORMAT_A2R10G10B10_SSCALED_PACK32 = 61,
+    VK_FORMAT_A2R10G10B10_UINT_PACK32 = 62,
+    VK_FORMAT_A2R10G10B10_SINT_PACK32 = 63,
+    VK_FORMAT_A2B10G10R10_UNORM_PACK32 = 64,
+    VK_FORMAT_A2B10G10R10_SNORM_PACK32 = 65,
+    VK_FORMAT_A2B10G10R10_USCALED_PACK32 = 66,
+    VK_FORMAT_A2B10G10R10_SSCALED_PACK32 = 67,
+    VK_FORMAT_A2B10G10R10_UINT_PACK32 = 68,
+    VK_FORMAT_A2B10G10R10_SINT_PACK32 = 69,
+    VK_FORMAT_R16_UNORM = 70,
+    VK_FORMAT_R16_SNORM = 71,
+    VK_FORMAT_R16_USCALED = 72,
+    VK_FORMAT_R16_SSCALED = 73,
+    VK_FORMAT_R16_UINT = 74,
+    VK_FORMAT_R16_SINT = 75,
+    VK_FORMAT_R16_SFLOAT = 76,
+    VK_FORMAT_R16G16_UNORM = 77,
+    VK_FORMAT_R16G16_SNORM = 78,
+    VK_FORMAT_R16G16_USCALED = 79,
+    VK_FORMAT_R16G16_SSCALED = 80,
+    VK_FORMAT_R16G16_UINT = 81,
+    VK_FORMAT_R16G16_SINT = 82,
+    VK_FORMAT_R16G16_SFLOAT = 83,
+    VK_FORMAT_R16G16B16_UNORM = 84,
+    VK_FORMAT_R16G16B16_SNORM = 85,
+    VK_FORMAT_R16G16B16_USCALED = 86,
+    VK_FORMAT_R16G16B16_SSCALED = 87,
+    VK_FORMAT_R16G16B16_UINT = 88,
+    VK_FORMAT_R16G16B16_SINT = 89,
+    VK_FORMAT_R16G16B16_SFLOAT = 90,
+    VK_FORMAT_R16G16B16A16_UNORM = 91,
+    VK_FORMAT_R16G16B16A16_SNORM = 92,
+    VK_FORMAT_R16G16B16A16_USCALED = 93,
+    VK_FORMAT_R16G16B16A16_SSCALED = 94,
+    VK_FORMAT_R16G16B16A16_UINT = 95,
+    VK_FORMAT_R16G16B16A16_SINT = 96,
+    VK_FORMAT_R16G16B16A16_SFLOAT = 97,
+    VK_FORMAT_R32_UINT = 98,
+    VK_FORMAT_R32_SINT = 99,
+    VK_FORMAT_R32_SFLOAT = 100,
+    VK_FORMAT_R32G32_UINT = 101,
+    VK_FORMAT_R32G32_SINT = 102,
+    VK_FORMAT_R32G32_SFLOAT = 103,
+    VK_FORMAT_R32G32B32_UINT = 104,
+    VK_FORMAT_R32G32B32_SINT = 105,
+    VK_FORMAT_R32G32B32_SFLOAT = 106,
+    VK_FORMAT_R32G32B32A32_UINT = 107,
+    VK_FORMAT_R32G32B32A32_SINT = 108,
+    VK_FORMAT_R32G32B32A32_SFLOAT = 109,
+    VK_FORMAT_R64_UINT = 110,
+    VK_FORMAT_R64_SINT = 111,
+    VK_FORMAT_R64_SFLOAT = 112,
+    VK_FORMAT_R64G64_UINT = 113,
+    VK_FORMAT_R64G64_SINT = 114,
+    VK_FORMAT_R64G64_SFLOAT = 115,
+    VK_FORMAT_R64G64B64_UINT = 116,
+    VK_FORMAT_R64G64B64_SINT = 117,
+    VK_FORMAT_R64G64B64_SFLOAT = 118,
+    VK_FORMAT_R64G64B64A64_UINT = 119,
+    VK_FORMAT_R64G64B64A64_SINT = 120,
+    VK_FORMAT_R64G64B64A64_SFLOAT = 121,
+    VK_FORMAT_B10G11R11_UFLOAT_PACK32 = 122,
+    VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 = 123,
+    VK_FORMAT_D16_UNORM = 124,
+    VK_FORMAT_X8_D24_UNORM_PACK32 = 125,
+    VK_FORMAT_D32_SFLOAT = 126,
+    VK_FORMAT_S8_UINT = 127,
+    VK_FORMAT_D16_UNORM_S8_UINT = 128,
+    VK_FORMAT_D24_UNORM_S8_UINT = 129,
+    VK_FORMAT_D32_SFLOAT_S8_UINT = 130,
+    VK_FORMAT_BC1_RGB_UNORM_BLOCK = 131,
+    VK_FORMAT_BC1_RGB_SRGB_BLOCK = 132,
+    VK_FORMAT_BC1_RGBA_UNORM_BLOCK = 133,
+    VK_FORMAT_BC1_RGBA_SRGB_BLOCK = 134,
+    VK_FORMAT_BC2_UNORM_BLOCK = 135,
+    VK_FORMAT_BC2_SRGB_BLOCK = 136,
+    VK_FORMAT_BC3_UNORM_BLOCK = 137,
+    VK_FORMAT_BC3_SRGB_BLOCK = 138,
+    VK_FORMAT_BC4_UNORM_BLOCK = 139,
+    VK_FORMAT_BC4_SNORM_BLOCK = 140,
+    VK_FORMAT_BC5_UNORM_BLOCK = 141,
+    VK_FORMAT_BC5_SNORM_BLOCK = 142,
+    VK_FORMAT_BC6H_UFLOAT_BLOCK = 143,
+    VK_FORMAT_BC6H_SFLOAT_BLOCK = 144,
+    VK_FORMAT_BC7_UNORM_BLOCK = 145,
+    VK_FORMAT_BC7_SRGB_BLOCK = 146,
+    VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK = 147,
+    VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK = 148,
+    VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK = 149,
+    VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK = 150,
+    VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK = 151,
+    VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK = 152,
+    VK_FORMAT_EAC_R11_UNORM_BLOCK = 153,
+    VK_FORMAT_EAC_R11_SNORM_BLOCK = 154,
+    VK_FORMAT_EAC_R11G11_UNORM_BLOCK = 155,
+    VK_FORMAT_EAC_R11G11_SNORM_BLOCK = 156,
+    VK_FORMAT_ASTC_4x4_UNORM_BLOCK = 157,
+    VK_FORMAT_ASTC_4x4_SRGB_BLOCK = 158,
+    VK_FORMAT_ASTC_5x4_UNORM_BLOCK = 159,
+    VK_FORMAT_ASTC_5x4_SRGB_BLOCK = 160,
+    VK_FORMAT_ASTC_5x5_UNORM_BLOCK = 161,
+    VK_FORMAT_ASTC_5x5_SRGB_BLOCK = 162,
+    VK_FORMAT_ASTC_6x5_UNORM_BLOCK = 163,
+    VK_FORMAT_ASTC_6x5_SRGB_BLOCK = 164,
+    VK_FORMAT_ASTC_6x6_UNORM_BLOCK = 165,
+    VK_FORMAT_ASTC_6x6_SRGB_BLOCK = 166,
+    VK_FORMAT_ASTC_8x5_UNORM_BLOCK = 167,
+    VK_FORMAT_ASTC_8x5_SRGB_BLOCK = 168,
+    VK_FORMAT_ASTC_8x6_UNORM_BLOCK = 169,
+    VK_FORMAT_ASTC_8x6_SRGB_BLOCK = 170,
+    VK_FORMAT_ASTC_8x8_UNORM_BLOCK = 171,
+    VK_FORMAT_ASTC_8x8_SRGB_BLOCK = 172,
+    VK_FORMAT_ASTC_10x5_UNORM_BLOCK = 173,
+    VK_FORMAT_ASTC_10x5_SRGB_BLOCK = 174,
+    VK_FORMAT_ASTC_10x6_UNORM_BLOCK = 175,
+    VK_FORMAT_ASTC_10x6_SRGB_BLOCK = 176,
+    VK_FORMAT_ASTC_10x8_UNORM_BLOCK = 177,
+    VK_FORMAT_ASTC_10x8_SRGB_BLOCK = 178,
+    VK_FORMAT_ASTC_10x10_UNORM_BLOCK = 179,
+    VK_FORMAT_ASTC_10x10_SRGB_BLOCK = 180,
+    VK_FORMAT_ASTC_12x10_UNORM_BLOCK = 181,
+    VK_FORMAT_ASTC_12x10_SRGB_BLOCK = 182,
+    VK_FORMAT_ASTC_12x12_UNORM_BLOCK = 183,
+    VK_FORMAT_ASTC_12x12_SRGB_BLOCK = 184,
+    VK_FORMAT_BEGIN_RANGE = VK_FORMAT_UNDEFINED,
+    VK_FORMAT_END_RANGE = VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+    VK_FORMAT_RANGE_SIZE = (VK_FORMAT_ASTC_12x12_SRGB_BLOCK - VK_FORMAT_UNDEFINED + 1),
+    VK_FORMAT_MAX_ENUM = 0x7FFFFFFF
+} VkFormat;
+
+/* Dimensionality of a VkImage (1D, 2D or 3D).
+ * Note: the *_BEGIN_RANGE/_END_RANGE/_RANGE_SIZE members throughout these
+ * generated enums are iteration/validation helpers over the core values,
+ * and *_MAX_ENUM = 0x7FFFFFFF forces a 32-bit underlying type. */
+typedef enum VkImageType {
+    VK_IMAGE_TYPE_1D = 0,
+    VK_IMAGE_TYPE_2D = 1,
+    VK_IMAGE_TYPE_3D = 2,
+    VK_IMAGE_TYPE_BEGIN_RANGE = VK_IMAGE_TYPE_1D,
+    VK_IMAGE_TYPE_END_RANGE = VK_IMAGE_TYPE_3D,
+    VK_IMAGE_TYPE_RANGE_SIZE = (VK_IMAGE_TYPE_3D - VK_IMAGE_TYPE_1D + 1),
+    VK_IMAGE_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkImageType;
+
+/* Memory layout of an image: driver-chosen optimal tiling vs. row-major linear. */
+typedef enum VkImageTiling {
+    VK_IMAGE_TILING_OPTIMAL = 0,
+    VK_IMAGE_TILING_LINEAR = 1,
+    VK_IMAGE_TILING_BEGIN_RANGE = VK_IMAGE_TILING_OPTIMAL,
+    VK_IMAGE_TILING_END_RANGE = VK_IMAGE_TILING_LINEAR,
+    VK_IMAGE_TILING_RANGE_SIZE = (VK_IMAGE_TILING_LINEAR - VK_IMAGE_TILING_OPTIMAL + 1),
+    VK_IMAGE_TILING_MAX_ENUM = 0x7FFFFFFF
+} VkImageTiling;
+
+/* Broad classification of the physical device reported by the implementation. */
+typedef enum VkPhysicalDeviceType {
+    VK_PHYSICAL_DEVICE_TYPE_OTHER = 0,
+    VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU = 1,
+    VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU = 2,
+    VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU = 3,
+    VK_PHYSICAL_DEVICE_TYPE_CPU = 4,
+    VK_PHYSICAL_DEVICE_TYPE_BEGIN_RANGE = VK_PHYSICAL_DEVICE_TYPE_OTHER,
+    VK_PHYSICAL_DEVICE_TYPE_END_RANGE = VK_PHYSICAL_DEVICE_TYPE_CPU,
+    VK_PHYSICAL_DEVICE_TYPE_RANGE_SIZE = (VK_PHYSICAL_DEVICE_TYPE_CPU - VK_PHYSICAL_DEVICE_TYPE_OTHER + 1),
+    VK_PHYSICAL_DEVICE_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkPhysicalDeviceType;
+
+/* Kind of query held by a VkQueryPool. */
+typedef enum VkQueryType {
+    VK_QUERY_TYPE_OCCLUSION = 0,
+    VK_QUERY_TYPE_PIPELINE_STATISTICS = 1,
+    VK_QUERY_TYPE_TIMESTAMP = 2,
+    VK_QUERY_TYPE_BEGIN_RANGE = VK_QUERY_TYPE_OCCLUSION,
+    VK_QUERY_TYPE_END_RANGE = VK_QUERY_TYPE_TIMESTAMP,
+    VK_QUERY_TYPE_RANGE_SIZE = (VK_QUERY_TYPE_TIMESTAMP - VK_QUERY_TYPE_OCCLUSION + 1),
+    VK_QUERY_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkQueryType;
+
+/* Queue-family ownership model for a buffer or image. */
+typedef enum VkSharingMode {
+    VK_SHARING_MODE_EXCLUSIVE = 0,
+    VK_SHARING_MODE_CONCURRENT = 1,
+    VK_SHARING_MODE_BEGIN_RANGE = VK_SHARING_MODE_EXCLUSIVE,
+    VK_SHARING_MODE_END_RANGE = VK_SHARING_MODE_CONCURRENT,
+    VK_SHARING_MODE_RANGE_SIZE = (VK_SHARING_MODE_CONCURRENT - VK_SHARING_MODE_EXCLUSIVE + 1),
+    VK_SHARING_MODE_MAX_ENUM = 0x7FFFFFFF
+} VkSharingMode;
+
+/* Layout an image subresource is in; transitions are done via barriers. */
+typedef enum VkImageLayout {
+    VK_IMAGE_LAYOUT_UNDEFINED = 0,
+    VK_IMAGE_LAYOUT_GENERAL = 1,
+    VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL = 2,
+    VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL = 3,
+    VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL = 4,
+    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL = 5,
+    VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL = 6,
+    VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL = 7,
+    VK_IMAGE_LAYOUT_PREINITIALIZED = 8,
+    /* Extension value (VK_KHR_swapchain); deliberately outside the core
+     * BEGIN/END range below. */
+    VK_IMAGE_LAYOUT_PRESENT_SRC_KHR = 1000001002,
+    VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED,
+    VK_IMAGE_LAYOUT_END_RANGE = VK_IMAGE_LAYOUT_PREINITIALIZED,
+    VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1),
+    VK_IMAGE_LAYOUT_MAX_ENUM = 0x7FFFFFFF
+} VkImageLayout;
+
+/* View dimensionality for a VkImageView (including array and cube variants). */
+typedef enum VkImageViewType {
+    VK_IMAGE_VIEW_TYPE_1D = 0,
+    VK_IMAGE_VIEW_TYPE_2D = 1,
+    VK_IMAGE_VIEW_TYPE_3D = 2,
+    VK_IMAGE_VIEW_TYPE_CUBE = 3,
+    VK_IMAGE_VIEW_TYPE_1D_ARRAY = 4,
+    VK_IMAGE_VIEW_TYPE_2D_ARRAY = 5,
+    VK_IMAGE_VIEW_TYPE_CUBE_ARRAY = 6,
+    VK_IMAGE_VIEW_TYPE_BEGIN_RANGE = VK_IMAGE_VIEW_TYPE_1D,
+    VK_IMAGE_VIEW_TYPE_END_RANGE = VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
+    VK_IMAGE_VIEW_TYPE_RANGE_SIZE = (VK_IMAGE_VIEW_TYPE_CUBE_ARRAY - VK_IMAGE_VIEW_TYPE_1D + 1),
+    VK_IMAGE_VIEW_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkImageViewType;
+
+/* Per-component remapping applied by an image view's component mapping. */
+typedef enum VkComponentSwizzle {
+    VK_COMPONENT_SWIZZLE_IDENTITY = 0,
+    VK_COMPONENT_SWIZZLE_ZERO = 1,
+    VK_COMPONENT_SWIZZLE_ONE = 2,
+    VK_COMPONENT_SWIZZLE_R = 3,
+    VK_COMPONENT_SWIZZLE_G = 4,
+    VK_COMPONENT_SWIZZLE_B = 5,
+    VK_COMPONENT_SWIZZLE_A = 6,
+    VK_COMPONENT_SWIZZLE_BEGIN_RANGE = VK_COMPONENT_SWIZZLE_IDENTITY,
+    VK_COMPONENT_SWIZZLE_END_RANGE = VK_COMPONENT_SWIZZLE_A,
+    VK_COMPONENT_SWIZZLE_RANGE_SIZE = (VK_COMPONENT_SWIZZLE_A - VK_COMPONENT_SWIZZLE_IDENTITY + 1),
+    VK_COMPONENT_SWIZZLE_MAX_ENUM = 0x7FFFFFFF
+} VkComponentSwizzle;
+
+/* Whether a vertex binding advances per vertex or per instance. */
+typedef enum VkVertexInputRate {
+    VK_VERTEX_INPUT_RATE_VERTEX = 0,
+    VK_VERTEX_INPUT_RATE_INSTANCE = 1,
+    VK_VERTEX_INPUT_RATE_BEGIN_RANGE = VK_VERTEX_INPUT_RATE_VERTEX,
+    VK_VERTEX_INPUT_RATE_END_RANGE = VK_VERTEX_INPUT_RATE_INSTANCE,
+    VK_VERTEX_INPUT_RATE_RANGE_SIZE = (VK_VERTEX_INPUT_RATE_INSTANCE - VK_VERTEX_INPUT_RATE_VERTEX + 1),
+    VK_VERTEX_INPUT_RATE_MAX_ENUM = 0x7FFFFFFF
+} VkVertexInputRate;
+
+/* Primitive assembly topology used by the input-assembly stage. */
+typedef enum VkPrimitiveTopology {
+    VK_PRIMITIVE_TOPOLOGY_POINT_LIST = 0,
+    VK_PRIMITIVE_TOPOLOGY_LINE_LIST = 1,
+    VK_PRIMITIVE_TOPOLOGY_LINE_STRIP = 2,
+    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST = 3,
+    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP = 4,
+    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN = 5,
+    VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY = 6,
+    VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY = 7,
+    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY = 8,
+    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY = 9,
+    VK_PRIMITIVE_TOPOLOGY_PATCH_LIST = 10,
+    VK_PRIMITIVE_TOPOLOGY_BEGIN_RANGE = VK_PRIMITIVE_TOPOLOGY_POINT_LIST,
+    VK_PRIMITIVE_TOPOLOGY_END_RANGE = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST,
+    VK_PRIMITIVE_TOPOLOGY_RANGE_SIZE = (VK_PRIMITIVE_TOPOLOGY_PATCH_LIST - VK_PRIMITIVE_TOPOLOGY_POINT_LIST + 1),
+    VK_PRIMITIVE_TOPOLOGY_MAX_ENUM = 0x7FFFFFFF
+} VkPrimitiveTopology;
+
+/* How polygons are rasterized: filled, as lines, or as points. */
+typedef enum VkPolygonMode {
+    VK_POLYGON_MODE_FILL = 0,
+    VK_POLYGON_MODE_LINE = 1,
+    VK_POLYGON_MODE_POINT = 2,
+    VK_POLYGON_MODE_BEGIN_RANGE = VK_POLYGON_MODE_FILL,
+    VK_POLYGON_MODE_END_RANGE = VK_POLYGON_MODE_POINT,
+    VK_POLYGON_MODE_RANGE_SIZE = (VK_POLYGON_MODE_POINT - VK_POLYGON_MODE_FILL + 1),
+    VK_POLYGON_MODE_MAX_ENUM = 0x7FFFFFFF
+} VkPolygonMode;
+
+/* Winding order that defines a front-facing triangle. */
+typedef enum VkFrontFace {
+    VK_FRONT_FACE_COUNTER_CLOCKWISE = 0,
+    VK_FRONT_FACE_CLOCKWISE = 1,
+    VK_FRONT_FACE_BEGIN_RANGE = VK_FRONT_FACE_COUNTER_CLOCKWISE,
+    VK_FRONT_FACE_END_RANGE = VK_FRONT_FACE_CLOCKWISE,
+    VK_FRONT_FACE_RANGE_SIZE = (VK_FRONT_FACE_CLOCKWISE - VK_FRONT_FACE_COUNTER_CLOCKWISE + 1),
+    VK_FRONT_FACE_MAX_ENUM = 0x7FFFFFFF
+} VkFrontFace;
+
+/* Comparison operator used for depth, stencil and sampler-compare tests. */
+typedef enum VkCompareOp {
+    VK_COMPARE_OP_NEVER = 0,
+    VK_COMPARE_OP_LESS = 1,
+    VK_COMPARE_OP_EQUAL = 2,
+    VK_COMPARE_OP_LESS_OR_EQUAL = 3,
+    VK_COMPARE_OP_GREATER = 4,
+    VK_COMPARE_OP_NOT_EQUAL = 5,
+    VK_COMPARE_OP_GREATER_OR_EQUAL = 6,
+    VK_COMPARE_OP_ALWAYS = 7,
+    VK_COMPARE_OP_BEGIN_RANGE = VK_COMPARE_OP_NEVER,
+    VK_COMPARE_OP_END_RANGE = VK_COMPARE_OP_ALWAYS,
+    VK_COMPARE_OP_RANGE_SIZE = (VK_COMPARE_OP_ALWAYS - VK_COMPARE_OP_NEVER + 1),
+    VK_COMPARE_OP_MAX_ENUM = 0x7FFFFFFF
+} VkCompareOp;
+
+/* Update operation applied to a stencil value after the stencil test. */
+typedef enum VkStencilOp {
+    VK_STENCIL_OP_KEEP = 0,
+    VK_STENCIL_OP_ZERO = 1,
+    VK_STENCIL_OP_REPLACE = 2,
+    VK_STENCIL_OP_INCREMENT_AND_CLAMP = 3,
+    VK_STENCIL_OP_DECREMENT_AND_CLAMP = 4,
+    VK_STENCIL_OP_INVERT = 5,
+    VK_STENCIL_OP_INCREMENT_AND_WRAP = 6,
+    VK_STENCIL_OP_DECREMENT_AND_WRAP = 7,
+    VK_STENCIL_OP_BEGIN_RANGE = VK_STENCIL_OP_KEEP,
+    VK_STENCIL_OP_END_RANGE = VK_STENCIL_OP_DECREMENT_AND_WRAP,
+    VK_STENCIL_OP_RANGE_SIZE = (VK_STENCIL_OP_DECREMENT_AND_WRAP - VK_STENCIL_OP_KEEP + 1),
+    VK_STENCIL_OP_MAX_ENUM = 0x7FFFFFFF
+} VkStencilOp;
+
+/* The 16 bitwise logical operations selectable for color-attachment logic ops. */
+typedef enum VkLogicOp {
+    VK_LOGIC_OP_CLEAR = 0,
+    VK_LOGIC_OP_AND = 1,
+    VK_LOGIC_OP_AND_REVERSE = 2,
+    VK_LOGIC_OP_COPY = 3,
+    VK_LOGIC_OP_AND_INVERTED = 4,
+    VK_LOGIC_OP_NO_OP = 5,
+    VK_LOGIC_OP_XOR = 6,
+    VK_LOGIC_OP_OR = 7,
+    VK_LOGIC_OP_NOR = 8,
+    VK_LOGIC_OP_EQUIVALENT = 9,
+    VK_LOGIC_OP_INVERT = 10,
+    VK_LOGIC_OP_OR_REVERSE = 11,
+    VK_LOGIC_OP_COPY_INVERTED = 12,
+    VK_LOGIC_OP_OR_INVERTED = 13,
+    VK_LOGIC_OP_NAND = 14,
+    VK_LOGIC_OP_SET = 15,
+    VK_LOGIC_OP_BEGIN_RANGE = VK_LOGIC_OP_CLEAR,
+    VK_LOGIC_OP_END_RANGE = VK_LOGIC_OP_SET,
+    VK_LOGIC_OP_RANGE_SIZE = (VK_LOGIC_OP_SET - VK_LOGIC_OP_CLEAR + 1),
+    VK_LOGIC_OP_MAX_ENUM = 0x7FFFFFFF
+} VkLogicOp;
+
+/* Source/destination multiplier used in the color-blend equation
+ * (SRC1 values support dual-source blending). */
+typedef enum VkBlendFactor {
+    VK_BLEND_FACTOR_ZERO = 0,
+    VK_BLEND_FACTOR_ONE = 1,
+    VK_BLEND_FACTOR_SRC_COLOR = 2,
+    VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR = 3,
+    VK_BLEND_FACTOR_DST_COLOR = 4,
+    VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR = 5,
+    VK_BLEND_FACTOR_SRC_ALPHA = 6,
+    VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA = 7,
+    VK_BLEND_FACTOR_DST_ALPHA = 8,
+    VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA = 9,
+    VK_BLEND_FACTOR_CONSTANT_COLOR = 10,
+    VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR = 11,
+    VK_BLEND_FACTOR_CONSTANT_ALPHA = 12,
+    VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA = 13,
+    VK_BLEND_FACTOR_SRC_ALPHA_SATURATE = 14,
+    VK_BLEND_FACTOR_SRC1_COLOR = 15,
+    VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR = 16,
+    VK_BLEND_FACTOR_SRC1_ALPHA = 17,
+    VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA = 18,
+    VK_BLEND_FACTOR_BEGIN_RANGE = VK_BLEND_FACTOR_ZERO,
+    VK_BLEND_FACTOR_END_RANGE = VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA,
+    VK_BLEND_FACTOR_RANGE_SIZE = (VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA - VK_BLEND_FACTOR_ZERO + 1),
+    VK_BLEND_FACTOR_MAX_ENUM = 0x7FFFFFFF
+} VkBlendFactor;
+
+/* Operator combining the weighted source and destination blend terms. */
+typedef enum VkBlendOp {
+    VK_BLEND_OP_ADD = 0,
+    VK_BLEND_OP_SUBTRACT = 1,
+    VK_BLEND_OP_REVERSE_SUBTRACT = 2,
+    VK_BLEND_OP_MIN = 3,
+    VK_BLEND_OP_MAX = 4,
+    VK_BLEND_OP_BEGIN_RANGE = VK_BLEND_OP_ADD,
+    VK_BLEND_OP_END_RANGE = VK_BLEND_OP_MAX,
+    VK_BLEND_OP_RANGE_SIZE = (VK_BLEND_OP_MAX - VK_BLEND_OP_ADD + 1),
+    VK_BLEND_OP_MAX_ENUM = 0x7FFFFFFF
+} VkBlendOp;
+
+/* Pieces of pipeline state that can be set dynamically on a command buffer
+ * instead of being baked into the pipeline object. */
+typedef enum VkDynamicState {
+    VK_DYNAMIC_STATE_VIEWPORT = 0,
+    VK_DYNAMIC_STATE_SCISSOR = 1,
+    VK_DYNAMIC_STATE_LINE_WIDTH = 2,
+    VK_DYNAMIC_STATE_DEPTH_BIAS = 3,
+    VK_DYNAMIC_STATE_BLEND_CONSTANTS = 4,
+    VK_DYNAMIC_STATE_DEPTH_BOUNDS = 5,
+    VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK = 6,
+    VK_DYNAMIC_STATE_STENCIL_WRITE_MASK = 7,
+    VK_DYNAMIC_STATE_STENCIL_REFERENCE = 8,
+    VK_DYNAMIC_STATE_BEGIN_RANGE = VK_DYNAMIC_STATE_VIEWPORT,
+    VK_DYNAMIC_STATE_END_RANGE = VK_DYNAMIC_STATE_STENCIL_REFERENCE,
+    VK_DYNAMIC_STATE_RANGE_SIZE = (VK_DYNAMIC_STATE_STENCIL_REFERENCE - VK_DYNAMIC_STATE_VIEWPORT + 1),
+    VK_DYNAMIC_STATE_MAX_ENUM = 0x7FFFFFFF
+} VkDynamicState;
+
+/* Texel filtering mode for sampler magnification/minification. */
+typedef enum VkFilter {
+    VK_FILTER_NEAREST = 0,
+    VK_FILTER_LINEAR = 1,
+    VK_FILTER_BEGIN_RANGE = VK_FILTER_NEAREST,
+    VK_FILTER_END_RANGE = VK_FILTER_LINEAR,
+    VK_FILTER_RANGE_SIZE = (VK_FILTER_LINEAR - VK_FILTER_NEAREST + 1),
+    VK_FILTER_MAX_ENUM = 0x7FFFFFFF
+} VkFilter;
+
+/* Filtering applied between mipmap levels. */
+typedef enum VkSamplerMipmapMode {
+    VK_SAMPLER_MIPMAP_MODE_NEAREST = 0,
+    VK_SAMPLER_MIPMAP_MODE_LINEAR = 1,
+    VK_SAMPLER_MIPMAP_MODE_BEGIN_RANGE = VK_SAMPLER_MIPMAP_MODE_NEAREST,
+    VK_SAMPLER_MIPMAP_MODE_END_RANGE = VK_SAMPLER_MIPMAP_MODE_LINEAR,
+    VK_SAMPLER_MIPMAP_MODE_RANGE_SIZE = (VK_SAMPLER_MIPMAP_MODE_LINEAR - VK_SAMPLER_MIPMAP_MODE_NEAREST + 1),
+    VK_SAMPLER_MIPMAP_MODE_MAX_ENUM = 0x7FFFFFFF
+} VkSamplerMipmapMode;
+
+/* Behavior for texture coordinates outside [0, 1). */
+typedef enum VkSamplerAddressMode {
+    VK_SAMPLER_ADDRESS_MODE_REPEAT = 0,
+    VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT = 1,
+    VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE = 2,
+    VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3,
+    VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE = 4,
+    VK_SAMPLER_ADDRESS_MODE_BEGIN_RANGE = VK_SAMPLER_ADDRESS_MODE_REPEAT,
+    VK_SAMPLER_ADDRESS_MODE_END_RANGE = VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE,
+    VK_SAMPLER_ADDRESS_MODE_RANGE_SIZE = (VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE - VK_SAMPLER_ADDRESS_MODE_REPEAT + 1),
+    VK_SAMPLER_ADDRESS_MODE_MAX_ENUM = 0x7FFFFFFF
+} VkSamplerAddressMode;
+
+/* Predefined border colors used with CLAMP_TO_BORDER addressing. */
+typedef enum VkBorderColor {
+    VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK = 0,
+    VK_BORDER_COLOR_INT_TRANSPARENT_BLACK = 1,
+    VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK = 2,
+    VK_BORDER_COLOR_INT_OPAQUE_BLACK = 3,
+    VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE = 4,
+    VK_BORDER_COLOR_INT_OPAQUE_WHITE = 5,
+    VK_BORDER_COLOR_BEGIN_RANGE = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,
+    VK_BORDER_COLOR_END_RANGE = VK_BORDER_COLOR_INT_OPAQUE_WHITE,
+    VK_BORDER_COLOR_RANGE_SIZE = (VK_BORDER_COLOR_INT_OPAQUE_WHITE - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK + 1),
+    VK_BORDER_COLOR_MAX_ENUM = 0x7FFFFFFF
+} VkBorderColor;
+
+/* Kind of resource bound through a descriptor. */
+typedef enum VkDescriptorType {
+    VK_DESCRIPTOR_TYPE_SAMPLER = 0,
+    VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER = 1,
+    VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE = 2,
+    VK_DESCRIPTOR_TYPE_STORAGE_IMAGE = 3,
+    VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER = 4,
+    VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER = 5,
+    VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER = 6,
+    VK_DESCRIPTOR_TYPE_STORAGE_BUFFER = 7,
+    VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC = 8,
+    VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC = 9,
+    VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT = 10,
+    VK_DESCRIPTOR_TYPE_BEGIN_RANGE = VK_DESCRIPTOR_TYPE_SAMPLER,
+    VK_DESCRIPTOR_TYPE_END_RANGE = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
+    VK_DESCRIPTOR_TYPE_RANGE_SIZE = (VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT - VK_DESCRIPTOR_TYPE_SAMPLER + 1),
+    VK_DESCRIPTOR_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkDescriptorType;
+
+/* What happens to an attachment's contents at render-pass load time. */
+typedef enum VkAttachmentLoadOp {
+    VK_ATTACHMENT_LOAD_OP_LOAD = 0,
+    VK_ATTACHMENT_LOAD_OP_CLEAR = 1,
+    VK_ATTACHMENT_LOAD_OP_DONT_CARE = 2,
+    VK_ATTACHMENT_LOAD_OP_BEGIN_RANGE = VK_ATTACHMENT_LOAD_OP_LOAD,
+    VK_ATTACHMENT_LOAD_OP_END_RANGE = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+    VK_ATTACHMENT_LOAD_OP_RANGE_SIZE = (VK_ATTACHMENT_LOAD_OP_DONT_CARE - VK_ATTACHMENT_LOAD_OP_LOAD + 1),
+    VK_ATTACHMENT_LOAD_OP_MAX_ENUM = 0x7FFFFFFF
+} VkAttachmentLoadOp;
+
+/* What happens to an attachment's contents at render-pass store time. */
+typedef enum VkAttachmentStoreOp {
+    VK_ATTACHMENT_STORE_OP_STORE = 0,
+    VK_ATTACHMENT_STORE_OP_DONT_CARE = 1,
+    VK_ATTACHMENT_STORE_OP_BEGIN_RANGE = VK_ATTACHMENT_STORE_OP_STORE,
+    VK_ATTACHMENT_STORE_OP_END_RANGE = VK_ATTACHMENT_STORE_OP_DONT_CARE,
+    VK_ATTACHMENT_STORE_OP_RANGE_SIZE = (VK_ATTACHMENT_STORE_OP_DONT_CARE - VK_ATTACHMENT_STORE_OP_STORE + 1),
+    VK_ATTACHMENT_STORE_OP_MAX_ENUM = 0x7FFFFFFF
+} VkAttachmentStoreOp;
+
+/* Pipeline type a bind/dispatch command targets. */
+typedef enum VkPipelineBindPoint {
+    VK_PIPELINE_BIND_POINT_GRAPHICS = 0,
+    VK_PIPELINE_BIND_POINT_COMPUTE = 1,
+    VK_PIPELINE_BIND_POINT_BEGIN_RANGE = VK_PIPELINE_BIND_POINT_GRAPHICS,
+    VK_PIPELINE_BIND_POINT_END_RANGE = VK_PIPELINE_BIND_POINT_COMPUTE,
+    VK_PIPELINE_BIND_POINT_RANGE_SIZE = (VK_PIPELINE_BIND_POINT_COMPUTE - VK_PIPELINE_BIND_POINT_GRAPHICS + 1),
+    VK_PIPELINE_BIND_POINT_MAX_ENUM = 0x7FFFFFFF
+} VkPipelineBindPoint;
+
+/* Primary (queue-submittable) vs. secondary (callable) command buffers. */
+typedef enum VkCommandBufferLevel {
+    VK_COMMAND_BUFFER_LEVEL_PRIMARY = 0,
+    VK_COMMAND_BUFFER_LEVEL_SECONDARY = 1,
+    VK_COMMAND_BUFFER_LEVEL_BEGIN_RANGE = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+    VK_COMMAND_BUFFER_LEVEL_END_RANGE = VK_COMMAND_BUFFER_LEVEL_SECONDARY,
+    VK_COMMAND_BUFFER_LEVEL_RANGE_SIZE = (VK_COMMAND_BUFFER_LEVEL_SECONDARY - VK_COMMAND_BUFFER_LEVEL_PRIMARY + 1),
+    VK_COMMAND_BUFFER_LEVEL_MAX_ENUM = 0x7FFFFFFF
+} VkCommandBufferLevel;
+
+/* Width of indices in a bound index buffer. */
+typedef enum VkIndexType {
+    VK_INDEX_TYPE_UINT16 = 0,
+    VK_INDEX_TYPE_UINT32 = 1,
+    VK_INDEX_TYPE_BEGIN_RANGE = VK_INDEX_TYPE_UINT16,
+    VK_INDEX_TYPE_END_RANGE = VK_INDEX_TYPE_UINT32,
+    VK_INDEX_TYPE_RANGE_SIZE = (VK_INDEX_TYPE_UINT32 - VK_INDEX_TYPE_UINT16 + 1),
+    VK_INDEX_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkIndexType;
+
+/* Whether subpass commands are recorded inline or in secondary buffers. */
+typedef enum VkSubpassContents {
+    VK_SUBPASS_CONTENTS_INLINE = 0,
+    VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS = 1,
+    VK_SUBPASS_CONTENTS_BEGIN_RANGE = VK_SUBPASS_CONTENTS_INLINE,
+    VK_SUBPASS_CONTENTS_END_RANGE = VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS,
+    VK_SUBPASS_CONTENTS_RANGE_SIZE = (VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS - VK_SUBPASS_CONTENTS_INLINE + 1),
+    VK_SUBPASS_CONTENTS_MAX_ENUM = 0x7FFFFFFF
+} VkSubpassContents;
+
+/* Flag typedefs follow the registry pattern: Vk*FlagBits enumerates the
+ * individual bits, and the corresponding Vk*Flags (a VkFlags/uint32_t
+ * bitmask) is what API entry points actually take.  Plain VkFlags typedefs
+ * with no FlagBits enum are reserved for future use. */
+typedef VkFlags VkInstanceCreateFlags;
+
+/* Per-format capabilities reported by vkGetPhysicalDeviceFormatProperties. */
+typedef enum VkFormatFeatureFlagBits {
+    VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT = 0x00000001,
+    VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT = 0x00000002,
+    VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT = 0x00000004,
+    VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000008,
+    VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT = 0x00000010,
+    VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT = 0x00000020,
+    VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT = 0x00000040,
+    VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT = 0x00000080,
+    VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT = 0x00000100,
+    VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT = 0x00000200,
+    VK_FORMAT_FEATURE_BLIT_SRC_BIT = 0x00000400,
+    VK_FORMAT_FEATURE_BLIT_DST_BIT = 0x00000800,
+} VkFormatFeatureFlagBits;
+typedef VkFlags VkFormatFeatureFlags;
+
+/* Intended usages declared at VkImage creation. */
+typedef enum VkImageUsageFlagBits {
+    VK_IMAGE_USAGE_TRANSFER_SRC_BIT = 0x00000001,
+    VK_IMAGE_USAGE_TRANSFER_DST_BIT = 0x00000002,
+    VK_IMAGE_USAGE_SAMPLED_BIT = 0x00000004,
+    VK_IMAGE_USAGE_STORAGE_BIT = 0x00000008,
+    VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT = 0x00000010,
+    VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT = 0x00000020,
+    VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT = 0x00000040,
+    VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT = 0x00000080,
+} VkImageUsageFlagBits;
+typedef VkFlags VkImageUsageFlags;
+
+/* Creation-time image properties (sparse residency, mutable format, cubes). */
+typedef enum VkImageCreateFlagBits {
+    VK_IMAGE_CREATE_SPARSE_BINDING_BIT = 0x00000001,
+    VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002,
+    VK_IMAGE_CREATE_SPARSE_ALIASED_BIT = 0x00000004,
+    VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT = 0x00000008,
+    VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT = 0x00000010,
+} VkImageCreateFlagBits;
+typedef VkFlags VkImageCreateFlags;
+
+/* Supported MSAA sample counts; each bit equals its sample count. */
+typedef enum VkSampleCountFlagBits {
+    VK_SAMPLE_COUNT_1_BIT = 0x00000001,
+    VK_SAMPLE_COUNT_2_BIT = 0x00000002,
+    VK_SAMPLE_COUNT_4_BIT = 0x00000004,
+    VK_SAMPLE_COUNT_8_BIT = 0x00000008,
+    VK_SAMPLE_COUNT_16_BIT = 0x00000010,
+    VK_SAMPLE_COUNT_32_BIT = 0x00000020,
+    VK_SAMPLE_COUNT_64_BIT = 0x00000040,
+} VkSampleCountFlagBits;
+typedef VkFlags VkSampleCountFlags;
+
+/* Operation classes a queue family supports. */
+typedef enum VkQueueFlagBits {
+    VK_QUEUE_GRAPHICS_BIT = 0x00000001,
+    VK_QUEUE_COMPUTE_BIT = 0x00000002,
+    VK_QUEUE_TRANSFER_BIT = 0x00000004,
+    VK_QUEUE_SPARSE_BINDING_BIT = 0x00000008,
+} VkQueueFlagBits;
+typedef VkFlags VkQueueFlags;
+
+/* Properties of a device memory type. */
+typedef enum VkMemoryPropertyFlagBits {
+    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT = 0x00000001,
+    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT = 0x00000002,
+    VK_MEMORY_PROPERTY_HOST_COHERENT_BIT = 0x00000004,
+    VK_MEMORY_PROPERTY_HOST_CACHED_BIT = 0x00000008,
+    VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT = 0x00000010,
+} VkMemoryPropertyFlagBits;
+typedef VkFlags VkMemoryPropertyFlags;
+
+/* Properties of a device memory heap. */
+typedef enum VkMemoryHeapFlagBits {
+    VK_MEMORY_HEAP_DEVICE_LOCAL_BIT = 0x00000001,
+} VkMemoryHeapFlagBits;
+typedef VkFlags VkMemoryHeapFlags;
+typedef VkFlags VkDeviceCreateFlags;
+typedef VkFlags VkDeviceQueueCreateFlags;
+
+/* Logical pipeline stages used as synchronization scopes in barriers,
+ * events and subpass dependencies. */
+typedef enum VkPipelineStageFlagBits {
+    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT = 0x00000001,
+    VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT = 0x00000002,
+    VK_PIPELINE_STAGE_VERTEX_INPUT_BIT = 0x00000004,
+    VK_PIPELINE_STAGE_VERTEX_SHADER_BIT = 0x00000008,
+    VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT = 0x00000010,
+    VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT = 0x00000020,
+    VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT = 0x00000040,
+    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT = 0x00000080,
+    VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT = 0x00000100,
+    VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT = 0x00000200,
+    VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT = 0x00000400,
+    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT = 0x00000800,
+    VK_PIPELINE_STAGE_TRANSFER_BIT = 0x00001000,
+    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT = 0x00002000,
+    VK_PIPELINE_STAGE_HOST_BIT = 0x00004000,
+    VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT = 0x00008000,
+    VK_PIPELINE_STAGE_ALL_COMMANDS_BIT = 0x00010000,
+} VkPipelineStageFlagBits;
+typedef VkFlags VkPipelineStageFlags;
+typedef VkFlags VkMemoryMapFlags;
+
+/* Aspects of an image (color vs. depth vs. stencil vs. sparse metadata). */
+typedef enum VkImageAspectFlagBits {
+    VK_IMAGE_ASPECT_COLOR_BIT = 0x00000001,
+    VK_IMAGE_ASPECT_DEPTH_BIT = 0x00000002,
+    VK_IMAGE_ASPECT_STENCIL_BIT = 0x00000004,
+    VK_IMAGE_ASPECT_METADATA_BIT = 0x00000008,
+} VkImageAspectFlagBits;
+typedef VkFlags VkImageAspectFlags;
+
+/* Properties of a sparse image format. */
+typedef enum VkSparseImageFormatFlagBits {
+    VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT = 0x00000001,
+    VK_SPARSE_IMAGE_FORMAT_ALIGNED_MIP_SIZE_BIT = 0x00000002,
+    VK_SPARSE_IMAGE_FORMAT_NONSTANDARD_BLOCK_SIZE_BIT = 0x00000004,
+} VkSparseImageFormatFlagBits;
+typedef VkFlags VkSparseImageFormatFlags;
+
+/* Options for a sparse memory bind operation. */
+typedef enum VkSparseMemoryBindFlagBits {
+    VK_SPARSE_MEMORY_BIND_METADATA_BIT = 0x00000001,
+} VkSparseMemoryBindFlagBits;
+typedef VkFlags VkSparseMemoryBindFlags;
+
+/* Fence creation options (start signaled or not). */
+typedef enum VkFenceCreateFlagBits {
+    VK_FENCE_CREATE_SIGNALED_BIT = 0x00000001,
+} VkFenceCreateFlagBits;
+typedef VkFlags VkFenceCreateFlags;
+typedef VkFlags VkSemaphoreCreateFlags;
+typedef VkFlags VkEventCreateFlags;
+typedef VkFlags VkQueryPoolCreateFlags;
+
+/* Counters selectable for a pipeline-statistics query. */
+typedef enum VkQueryPipelineStatisticFlagBits {
+    VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT = 0x00000001,
+    VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT = 0x00000002,
+    VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT = 0x00000004,
+    VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT = 0x00000008,
+    VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT = 0x00000010,
+    VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT = 0x00000020,
+    VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT = 0x00000040,
+    VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT = 0x00000080,
+    VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT = 0x00000100,
+    VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT = 0x00000200,
+    VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT = 0x00000400,
+} VkQueryPipelineStatisticFlagBits;
+typedef VkFlags VkQueryPipelineStatisticFlags;
+
+/* Options controlling how query results are returned. */
+typedef enum VkQueryResultFlagBits {
+    VK_QUERY_RESULT_64_BIT = 0x00000001,
+    VK_QUERY_RESULT_WAIT_BIT = 0x00000002,
+    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004,
+    VK_QUERY_RESULT_PARTIAL_BIT = 0x00000008,
+} VkQueryResultFlagBits;
+typedef VkFlags VkQueryResultFlags;
+
+/* Sparse-memory options declared at VkBuffer creation. */
+typedef enum VkBufferCreateFlagBits {
+    VK_BUFFER_CREATE_SPARSE_BINDING_BIT = 0x00000001,
+    VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002,
+    VK_BUFFER_CREATE_SPARSE_ALIASED_BIT = 0x00000004,
+} VkBufferCreateFlagBits;
+typedef VkFlags VkBufferCreateFlags;
+
+/* Intended usages declared at VkBuffer creation. */
+typedef enum VkBufferUsageFlagBits {
+    VK_BUFFER_USAGE_TRANSFER_SRC_BIT = 0x00000001,
+    VK_BUFFER_USAGE_TRANSFER_DST_BIT = 0x00000002,
+    VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000004,
+    VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT = 0x00000008,
+    VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT = 0x00000010,
+    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT = 0x00000020,
+    VK_BUFFER_USAGE_INDEX_BUFFER_BIT = 0x00000040,
+    VK_BUFFER_USAGE_VERTEX_BUFFER_BIT = 0x00000080,
+    VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT = 0x00000100,
+} VkBufferUsageFlagBits;
+typedef VkFlags VkBufferUsageFlags;
+typedef VkFlags VkBufferViewCreateFlags;
+typedef VkFlags VkImageViewCreateFlags;
+typedef VkFlags VkShaderModuleCreateFlags;
+typedef VkFlags VkPipelineCacheCreateFlags;
+
+/* Options influencing pipeline compilation and derivation. */
+typedef enum VkPipelineCreateFlagBits {
+    VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT = 0x00000001,
+    VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT = 0x00000002,
+    VK_PIPELINE_CREATE_DERIVATIVE_BIT = 0x00000004,
+} VkPipelineCreateFlagBits;
+typedef VkFlags VkPipelineCreateFlags;
+typedef VkFlags VkPipelineShaderStageCreateFlags;
+
+/* Shader stage selectors; note the two convenience masks at the end are
+ * composite values, not single bits. */
+typedef enum VkShaderStageFlagBits {
+    VK_SHADER_STAGE_VERTEX_BIT = 0x00000001,
+    VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT = 0x00000002,
+    VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT = 0x00000004,
+    VK_SHADER_STAGE_GEOMETRY_BIT = 0x00000008,
+    VK_SHADER_STAGE_FRAGMENT_BIT = 0x00000010,
+    VK_SHADER_STAGE_COMPUTE_BIT = 0x00000020,
+    VK_SHADER_STAGE_ALL_GRAPHICS = 0x1F,
+    VK_SHADER_STAGE_ALL = 0x7FFFFFFF,
+} VkShaderStageFlagBits;
+typedef VkFlags VkPipelineVertexInputStateCreateFlags;
+typedef VkFlags VkPipelineInputAssemblyStateCreateFlags;
+typedef VkFlags VkPipelineTessellationStateCreateFlags;
+typedef VkFlags VkPipelineViewportStateCreateFlags;
+typedef VkFlags VkPipelineRasterizationStateCreateFlags;
+
+/* Which triangle faces are culled; FRONT_AND_BACK is the OR of both bits. */
+typedef enum VkCullModeFlagBits {
+    VK_CULL_MODE_NONE = 0,
+    VK_CULL_MODE_FRONT_BIT = 0x00000001,
+    VK_CULL_MODE_BACK_BIT = 0x00000002,
+    VK_CULL_MODE_FRONT_AND_BACK = 0x3,
+} VkCullModeFlagBits;
+typedef VkFlags VkCullModeFlags;
+typedef VkFlags VkPipelineMultisampleStateCreateFlags;
+typedef VkFlags VkPipelineDepthStencilStateCreateFlags;
+typedef VkFlags VkPipelineColorBlendStateCreateFlags;
+
+/* Per-channel color write mask. */
+typedef enum VkColorComponentFlagBits {
+    VK_COLOR_COMPONENT_R_BIT = 0x00000001,
+    VK_COLOR_COMPONENT_G_BIT = 0x00000002,
+    VK_COLOR_COMPONENT_B_BIT = 0x00000004,
+    VK_COLOR_COMPONENT_A_BIT = 0x00000008,
+} VkColorComponentFlagBits;
+typedef VkFlags VkColorComponentFlags;
+typedef VkFlags VkPipelineDynamicStateCreateFlags;
+typedef VkFlags VkPipelineLayoutCreateFlags;
+typedef VkFlags VkShaderStageFlags;
+typedef VkFlags VkSamplerCreateFlags;
+typedef VkFlags VkDescriptorSetLayoutCreateFlags;
+
+/* Descriptor-pool behavior (allow freeing individual sets). */
+typedef enum VkDescriptorPoolCreateFlagBits {
+    VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT = 0x00000001,
+} VkDescriptorPoolCreateFlagBits;
+typedef VkFlags VkDescriptorPoolCreateFlags;
+typedef VkFlags VkDescriptorPoolResetFlags;
+typedef VkFlags VkFramebufferCreateFlags;
+typedef VkFlags VkRenderPassCreateFlags;
+
+/* Attachment description options (memory aliasing between attachments). */
+typedef enum VkAttachmentDescriptionFlagBits {
+    VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT = 0x00000001,
+} VkAttachmentDescriptionFlagBits;
+typedef VkFlags VkAttachmentDescriptionFlags;
+typedef VkFlags VkSubpassDescriptionFlags;
+
+/* Memory access types used as source/destination access masks in barriers
+ * and subpass dependencies. */
+typedef enum VkAccessFlagBits {
+    VK_ACCESS_INDIRECT_COMMAND_READ_BIT = 0x00000001,
+    VK_ACCESS_INDEX_READ_BIT = 0x00000002,
+    VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT = 0x00000004,
+    VK_ACCESS_UNIFORM_READ_BIT = 0x00000008,
+    VK_ACCESS_INPUT_ATTACHMENT_READ_BIT = 0x00000010,
+    VK_ACCESS_SHADER_READ_BIT = 0x00000020,
+    VK_ACCESS_SHADER_WRITE_BIT = 0x00000040,
+    VK_ACCESS_COLOR_ATTACHMENT_READ_BIT = 0x00000080,
+    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT = 0x00000100,
+    VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT = 0x00000200,
+    VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT = 0x00000400,
+    VK_ACCESS_TRANSFER_READ_BIT = 0x00000800,
+    VK_ACCESS_TRANSFER_WRITE_BIT = 0x00001000,
+    VK_ACCESS_HOST_READ_BIT = 0x00002000,
+    VK_ACCESS_HOST_WRITE_BIT = 0x00004000,
+    VK_ACCESS_MEMORY_READ_BIT = 0x00008000,
+    VK_ACCESS_MEMORY_WRITE_BIT = 0x00010000,
+} VkAccessFlagBits;
+typedef VkFlags VkAccessFlags;
+
+/* Subpass dependency options (framebuffer-region-local dependency). */
+typedef enum VkDependencyFlagBits {
+    VK_DEPENDENCY_BY_REGION_BIT = 0x00000001,
+} VkDependencyFlagBits;
+typedef VkFlags VkDependencyFlags;
+
+/* Command-pool creation options. */
+typedef enum VkCommandPoolCreateFlagBits {
+    VK_COMMAND_POOL_CREATE_TRANSIENT_BIT = 0x00000001,
+    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT = 0x00000002,
+} VkCommandPoolCreateFlagBits;
+typedef VkFlags VkCommandPoolCreateFlags;
+
+/* Command-pool reset options. */
+typedef enum VkCommandPoolResetFlagBits {
+    VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT = 0x00000001,
+} VkCommandPoolResetFlagBits;
+typedef VkFlags VkCommandPoolResetFlags;
+
+/* Options supplied when beginning command-buffer recording. */
+typedef enum VkCommandBufferUsageFlagBits {
+    VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT = 0x00000001,
+    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT = 0x00000002,
+    VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT = 0x00000004,
+} VkCommandBufferUsageFlagBits;
+typedef VkFlags VkCommandBufferUsageFlags;
+
+/* Query control options (precise occlusion counting). */
+typedef enum VkQueryControlFlagBits {
+    VK_QUERY_CONTROL_PRECISE_BIT = 0x00000001,
+} VkQueryControlFlagBits;
+typedef VkFlags VkQueryControlFlags;
+
+/* Command-buffer reset options. */
+typedef enum VkCommandBufferResetFlagBits {
+    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT = 0x00000001,
+} VkCommandBufferResetFlagBits;
+typedef VkFlags VkCommandBufferResetFlags;
+
+/* Which stencil face(s) a dynamic stencil-state command affects;
+ * FRONT_AND_BACK is the OR of both bits. */
+typedef enum VkStencilFaceFlagBits {
+    VK_STENCIL_FACE_FRONT_BIT = 0x00000001,
+    VK_STENCIL_FACE_BACK_BIT = 0x00000002,
+    VK_STENCIL_FRONT_AND_BACK = 0x3,
+} VkStencilFaceFlagBits;
+typedef VkFlags VkStencilFaceFlags;
+
+/* Application-supplied host allocator: allocate `size` bytes aligned to
+ * `alignment`; `pUserData` is the value stored in VkAllocationCallbacks. */
+typedef void* (VKAPI_PTR *PFN_vkAllocationFunction)(
+    void*                                       pUserData,
+    size_t                                      size,
+    size_t                                      alignment,
+    VkSystemAllocationScope                     allocationScope);
+
+/* Application-supplied host reallocator for `pOriginal`. */
+typedef void* (VKAPI_PTR *PFN_vkReallocationFunction)(
+    void*                                       pUserData,
+    void*                                       pOriginal,
+    size_t                                      size,
+    size_t                                      alignment,
+    VkSystemAllocationScope                     allocationScope);
+
+/* Application-supplied free for memory returned by the allocation/
+ * reallocation callbacks. */
+typedef void (VKAPI_PTR *PFN_vkFreeFunction)(
+    void*                                       pUserData,
+    void*                                       pMemory);
+
+/* Notification that the implementation made an internal allocation. */
+typedef void (VKAPI_PTR *PFN_vkInternalAllocationNotification)(
+    void*                                       pUserData,
+    size_t                                      size,
+    VkInternalAllocationType                    allocationType,
+    VkSystemAllocationScope                     allocationScope);
+
+/* Notification that the implementation freed an internal allocation. */
+typedef void (VKAPI_PTR *PFN_vkInternalFreeNotification)(
+    void*                                       pUserData,
+    size_t                                      size,
+    VkInternalAllocationType                    allocationType,
+    VkSystemAllocationScope                     allocationScope);
+
+/* Generic function pointer returned by vkGet*ProcAddr; cast before calling. */
+typedef void (VKAPI_PTR *PFN_vkVoidFunction)(void);
+
+/* Optional application/engine identification plus the requested apiVersion. */
+typedef struct VkApplicationInfo {
+    VkStructureType    sType;
+    const void*        pNext;
+    const char*        pApplicationName;
+    uint32_t           applicationVersion;
+    const char*        pEngineName;
+    uint32_t           engineVersion;
+    uint32_t           apiVersion;
+} VkApplicationInfo;
+
+/* Parameters for instance creation: application info and the layers and
+   extensions to enable. */
+typedef struct VkInstanceCreateInfo {
+    VkStructureType             sType;
+    const void*                 pNext;
+    VkInstanceCreateFlags       flags;
+    const VkApplicationInfo*    pApplicationInfo;
+    uint32_t                    enabledLayerCount;
+    const char* const*          ppEnabledLayerNames;
+    uint32_t                    enabledExtensionCount;
+    const char* const*          ppEnabledExtensionNames;
+} VkInstanceCreateInfo;
+
+/* Bundle of the application's host allocation callbacks (PFN_vk* above). */
+typedef struct VkAllocationCallbacks {
+    void*                                   pUserData;
+    PFN_vkAllocationFunction                pfnAllocation;
+    PFN_vkReallocationFunction              pfnReallocation;
+    PFN_vkFreeFunction                      pfnFree;
+    PFN_vkInternalAllocationNotification    pfnInternalAllocation;
+    PFN_vkInternalFreeNotification          pfnInternalFree;
+} VkAllocationCallbacks;
+
+/* Fine-grained feature switches; the same structure is used through
+   VkDeviceCreateInfo::pEnabledFeatures to enable features on a device. */
+typedef struct VkPhysicalDeviceFeatures {
+    VkBool32    robustBufferAccess;
+    VkBool32    fullDrawIndexUint32;
+    VkBool32    imageCubeArray;
+    VkBool32    independentBlend;
+    VkBool32    geometryShader;
+    VkBool32    tessellationShader;
+    VkBool32    sampleRateShading;
+    VkBool32    dualSrcBlend;
+    VkBool32    logicOp;
+    VkBool32    multiDrawIndirect;
+    VkBool32    drawIndirectFirstInstance;
+    VkBool32    depthClamp;
+    VkBool32    depthBiasClamp;
+    VkBool32    fillModeNonSolid;
+    VkBool32    depthBounds;
+    VkBool32    wideLines;
+    VkBool32    largePoints;
+    VkBool32    alphaToOne;
+    VkBool32    multiViewport;
+    VkBool32    samplerAnisotropy;
+    VkBool32    textureCompressionETC2;
+    VkBool32    textureCompressionASTC_LDR;
+    VkBool32    textureCompressionBC;
+    VkBool32    occlusionQueryPrecise;
+    VkBool32    pipelineStatisticsQuery;
+    VkBool32    vertexPipelineStoresAndAtomics;
+    VkBool32    fragmentStoresAndAtomics;
+    VkBool32    shaderTessellationAndGeometryPointSize;
+    VkBool32    shaderImageGatherExtended;
+    VkBool32    shaderStorageImageExtendedFormats;
+    VkBool32    shaderStorageImageMultisample;
+    VkBool32    shaderStorageImageReadWithoutFormat;
+    VkBool32    shaderStorageImageWriteWithoutFormat;
+    VkBool32    shaderUniformBufferArrayDynamicIndexing;
+    VkBool32    shaderSampledImageArrayDynamicIndexing;
+    VkBool32    shaderStorageBufferArrayDynamicIndexing;
+    VkBool32    shaderStorageImageArrayDynamicIndexing;
+    VkBool32    shaderClipDistance;
+    VkBool32    shaderCullDistance;
+    VkBool32    shaderFloat64;
+    VkBool32    shaderInt64;
+    VkBool32    shaderInt16;
+    VkBool32    shaderResourceResidency;
+    VkBool32    shaderResourceMinLod;
+    VkBool32    sparseBinding;
+    VkBool32    sparseResidencyBuffer;
+    VkBool32    sparseResidencyImage2D;
+    VkBool32    sparseResidencyImage3D;
+    VkBool32    sparseResidency2Samples;
+    VkBool32    sparseResidency4Samples;
+    VkBool32    sparseResidency8Samples;
+    VkBool32    sparseResidency16Samples;
+    VkBool32    sparseResidencyAliased;
+    VkBool32    variableMultisampleRate;
+    VkBool32    inheritedQueries;
+} VkPhysicalDeviceFeatures;
+
+/* Per-format capability flags, split by linear tiling, optimal tiling and
+   buffer usage. */
+typedef struct VkFormatProperties {
+    VkFormatFeatureFlags    linearTilingFeatures;
+    VkFormatFeatureFlags    optimalTilingFeatures;
+    VkFormatFeatureFlags    bufferFeatures;
+} VkFormatProperties;
+
+/* Width/height/depth triple. */
+typedef struct VkExtent3D {
+    uint32_t    width;
+    uint32_t    height;
+    uint32_t    depth;
+} VkExtent3D;
+
+/* Capabilities reported for a specific image configuration. */
+typedef struct VkImageFormatProperties {
+    VkExtent3D            maxExtent;
+    uint32_t              maxMipLevels;
+    uint32_t              maxArrayLayers;
+    VkSampleCountFlags    sampleCounts;
+    VkDeviceSize          maxResourceSize;
+} VkImageFormatProperties;
+
+/* Implementation-dependent numeric limits reported for a physical device. */
+typedef struct VkPhysicalDeviceLimits {
+    uint32_t              maxImageDimension1D;
+    uint32_t              maxImageDimension2D;
+    uint32_t              maxImageDimension3D;
+    uint32_t              maxImageDimensionCube;
+    uint32_t              maxImageArrayLayers;
+    uint32_t              maxTexelBufferElements;
+    uint32_t              maxUniformBufferRange;
+    uint32_t              maxStorageBufferRange;
+    uint32_t              maxPushConstantsSize;
+    uint32_t              maxMemoryAllocationCount;
+    uint32_t              maxSamplerAllocationCount;
+    VkDeviceSize          bufferImageGranularity;
+    VkDeviceSize          sparseAddressSpaceSize;
+    uint32_t              maxBoundDescriptorSets;
+    uint32_t              maxPerStageDescriptorSamplers;
+    uint32_t              maxPerStageDescriptorUniformBuffers;
+    uint32_t              maxPerStageDescriptorStorageBuffers;
+    uint32_t              maxPerStageDescriptorSampledImages;
+    uint32_t              maxPerStageDescriptorStorageImages;
+    uint32_t              maxPerStageDescriptorInputAttachments;
+    uint32_t              maxPerStageResources;
+    uint32_t              maxDescriptorSetSamplers;
+    uint32_t              maxDescriptorSetUniformBuffers;
+    uint32_t              maxDescriptorSetUniformBuffersDynamic;
+    uint32_t              maxDescriptorSetStorageBuffers;
+    uint32_t              maxDescriptorSetStorageBuffersDynamic;
+    uint32_t              maxDescriptorSetSampledImages;
+    uint32_t              maxDescriptorSetStorageImages;
+    uint32_t              maxDescriptorSetInputAttachments;
+    uint32_t              maxVertexInputAttributes;
+    uint32_t              maxVertexInputBindings;
+    uint32_t              maxVertexInputAttributeOffset;
+    uint32_t              maxVertexInputBindingStride;
+    uint32_t              maxVertexOutputComponents;
+    uint32_t              maxTessellationGenerationLevel;
+    uint32_t              maxTessellationPatchSize;
+    uint32_t              maxTessellationControlPerVertexInputComponents;
+    uint32_t              maxTessellationControlPerVertexOutputComponents;
+    uint32_t              maxTessellationControlPerPatchOutputComponents;
+    uint32_t              maxTessellationControlTotalOutputComponents;
+    uint32_t              maxTessellationEvaluationInputComponents;
+    uint32_t              maxTessellationEvaluationOutputComponents;
+    uint32_t              maxGeometryShaderInvocations;
+    uint32_t              maxGeometryInputComponents;
+    uint32_t              maxGeometryOutputComponents;
+    uint32_t              maxGeometryOutputVertices;
+    uint32_t              maxGeometryTotalOutputComponents;
+    uint32_t              maxFragmentInputComponents;
+    uint32_t              maxFragmentOutputAttachments;
+    uint32_t              maxFragmentDualSrcAttachments;
+    uint32_t              maxFragmentCombinedOutputResources;
+    uint32_t              maxComputeSharedMemorySize;
+    uint32_t              maxComputeWorkGroupCount[3];
+    uint32_t              maxComputeWorkGroupInvocations;
+    uint32_t              maxComputeWorkGroupSize[3];
+    uint32_t              subPixelPrecisionBits;
+    uint32_t              subTexelPrecisionBits;
+    uint32_t              mipmapPrecisionBits;
+    uint32_t              maxDrawIndexedIndexValue;
+    uint32_t              maxDrawIndirectCount;
+    float                 maxSamplerLodBias;
+    float                 maxSamplerAnisotropy;
+    uint32_t              maxViewports;
+    uint32_t              maxViewportDimensions[2];
+    float                 viewportBoundsRange[2];
+    uint32_t              viewportSubPixelBits;
+    size_t                minMemoryMapAlignment;
+    VkDeviceSize          minTexelBufferOffsetAlignment;
+    VkDeviceSize          minUniformBufferOffsetAlignment;
+    VkDeviceSize          minStorageBufferOffsetAlignment;
+    int32_t               minTexelOffset;
+    uint32_t              maxTexelOffset;
+    int32_t               minTexelGatherOffset;
+    uint32_t              maxTexelGatherOffset;
+    float                 minInterpolationOffset;
+    float                 maxInterpolationOffset;
+    uint32_t              subPixelInterpolationOffsetBits;
+    uint32_t              maxFramebufferWidth;
+    uint32_t              maxFramebufferHeight;
+    uint32_t              maxFramebufferLayers;
+    VkSampleCountFlags    framebufferColorSampleCounts;
+    VkSampleCountFlags    framebufferDepthSampleCounts;
+    VkSampleCountFlags    framebufferStencilSampleCounts;
+    VkSampleCountFlags    framebufferNoAttachmentsSampleCounts;
+    uint32_t              maxColorAttachments;
+    VkSampleCountFlags    sampledImageColorSampleCounts;
+    VkSampleCountFlags    sampledImageIntegerSampleCounts;
+    VkSampleCountFlags    sampledImageDepthSampleCounts;
+    VkSampleCountFlags    sampledImageStencilSampleCounts;
+    VkSampleCountFlags    storageImageSampleCounts;
+    uint32_t              maxSampleMaskWords;
+    VkBool32              timestampComputeAndGraphics;
+    float                 timestampPeriod;
+    uint32_t              maxClipDistances;
+    uint32_t              maxCullDistances;
+    uint32_t              maxCombinedClipAndCullDistances;
+    uint32_t              discreteQueuePriorities;
+    float                 pointSizeRange[2];
+    float                 lineWidthRange[2];
+    float                 pointSizeGranularity;
+    float                 lineWidthGranularity;
+    VkBool32              strictLines;
+    VkBool32              standardSampleLocations;
+    VkDeviceSize          optimalBufferCopyOffsetAlignment;
+    VkDeviceSize          optimalBufferCopyRowPitchAlignment;
+    VkDeviceSize          nonCoherentAtomSize;
+} VkPhysicalDeviceLimits;
+
+/* Properties of the implementation's sparse-residency support. */
+typedef struct VkPhysicalDeviceSparseProperties {
+    VkBool32    residencyStandard2DBlockShape;
+    VkBool32    residencyStandard2DMultisampleBlockShape;
+    VkBool32    residencyStandard3DBlockShape;
+    VkBool32    residencyAlignedMipSize;
+    VkBool32    residencyNonResidentStrict;
+} VkPhysicalDeviceSparseProperties;
+
+/* Identification, limits and sparse properties of a physical device. */
+typedef struct VkPhysicalDeviceProperties {
+    uint32_t                            apiVersion;
+    uint32_t                            driverVersion;
+    uint32_t                            vendorID;
+    uint32_t                            deviceID;
+    VkPhysicalDeviceType                deviceType;
+    char                                deviceName[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
+    uint8_t                             pipelineCacheUUID[VK_UUID_SIZE];
+    VkPhysicalDeviceLimits              limits;
+    VkPhysicalDeviceSparseProperties    sparseProperties;
+} VkPhysicalDeviceProperties;
+
+/* Capabilities of a single queue family. */
+typedef struct VkQueueFamilyProperties {
+    VkQueueFlags    queueFlags;
+    uint32_t        queueCount;
+    uint32_t        timestampValidBits;
+    VkExtent3D      minImageTransferGranularity;
+} VkQueueFamilyProperties;
+
+/* One memory type: its property flags and the index of its backing heap. */
+typedef struct VkMemoryType {
+    VkMemoryPropertyFlags    propertyFlags;
+    uint32_t                 heapIndex;
+} VkMemoryType;
+
+/* One memory heap: size in bytes and heap flags. */
+typedef struct VkMemoryHeap {
+    VkDeviceSize         size;
+    VkMemoryHeapFlags    flags;
+} VkMemoryHeap;
+
+/* The memory types and heaps exposed by a physical device. */
+typedef struct VkPhysicalDeviceMemoryProperties {
+    uint32_t        memoryTypeCount;
+    VkMemoryType    memoryTypes[VK_MAX_MEMORY_TYPES];
+    uint32_t        memoryHeapCount;
+    VkMemoryHeap    memoryHeaps[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryProperties;
+
+/* Requests queueCount queues from one queue family, with per-queue
+   priorities in pQueuePriorities. */
+typedef struct VkDeviceQueueCreateInfo {
+    VkStructureType             sType;
+    const void*                 pNext;
+    VkDeviceQueueCreateFlags    flags;
+    uint32_t                    queueFamilyIndex;
+    uint32_t                    queueCount;
+    const float*                pQueuePriorities;
+} VkDeviceQueueCreateInfo;
+
+/* Parameters for logical device creation: queues, layers, extensions and
+   the features to enable. */
+typedef struct VkDeviceCreateInfo {
+    VkStructureType                    sType;
+    const void*                        pNext;
+    VkDeviceCreateFlags                flags;
+    uint32_t                           queueCreateInfoCount;
+    const VkDeviceQueueCreateInfo*     pQueueCreateInfos;
+    uint32_t                           enabledLayerCount;
+    const char* const*                 ppEnabledLayerNames;
+    uint32_t                           enabledExtensionCount;
+    const char* const*                 ppEnabledExtensionNames;
+    const VkPhysicalDeviceFeatures*    pEnabledFeatures;
+} VkDeviceCreateInfo;
+
+/* Name and spec version of an extension. */
+typedef struct VkExtensionProperties {
+    char        extensionName[VK_MAX_EXTENSION_NAME_SIZE];
+    uint32_t    specVersion;
+} VkExtensionProperties;
+
+/* Name, versions and description of a layer. */
+typedef struct VkLayerProperties {
+    char        layerName[VK_MAX_EXTENSION_NAME_SIZE];
+    uint32_t    specVersion;
+    uint32_t    implementationVersion;
+    char        description[VK_MAX_DESCRIPTION_SIZE];
+} VkLayerProperties;
+
+/* One submission batch: command buffers plus the semaphores to wait on
+   (with per-semaphore stage masks) and to signal. */
+typedef struct VkSubmitInfo {
+    VkStructureType                sType;
+    const void*                    pNext;
+    uint32_t                       waitSemaphoreCount;
+    const VkSemaphore*             pWaitSemaphores;
+    const VkPipelineStageFlags*    pWaitDstStageMask;
+    uint32_t                       commandBufferCount;
+    const VkCommandBuffer*         pCommandBuffers;
+    uint32_t                       signalSemaphoreCount;
+    const VkSemaphore*             pSignalSemaphores;
+} VkSubmitInfo;
+
+/* Size and memory-type index of a device memory allocation. */
+typedef struct VkMemoryAllocateInfo {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkDeviceSize       allocationSize;
+    uint32_t           memoryTypeIndex;
+} VkMemoryAllocateInfo;
+
+/* A byte range [offset, offset+size) within a VkDeviceMemory object. */
+typedef struct VkMappedMemoryRange {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkDeviceMemory     memory;
+    VkDeviceSize       offset;
+    VkDeviceSize       size;
+} VkMappedMemoryRange;
+
+/* Size, alignment and allowed-memory-type bitmask for a resource. */
+typedef struct VkMemoryRequirements {
+    VkDeviceSize    size;
+    VkDeviceSize    alignment;
+    uint32_t        memoryTypeBits;
+} VkMemoryRequirements;
+
+/* Sparse tiling properties (aspect, tile granularity, flags) for an
+   image format. */
+typedef struct VkSparseImageFormatProperties {
+    VkImageAspectFlags          aspectMask;
+    VkExtent3D                  imageGranularity;
+    VkSparseImageFormatFlags    flags;
+} VkSparseImageFormatProperties;
+
+/* Sparse memory requirements of an image, including its mip tail region. */
+typedef struct VkSparseImageMemoryRequirements {
+    VkSparseImageFormatProperties    formatProperties;
+    uint32_t                         imageMipTailFirstLod;
+    VkDeviceSize                     imageMipTailSize;
+    VkDeviceSize                     imageMipTailOffset;
+    VkDeviceSize                     imageMipTailStride;
+} VkSparseImageMemoryRequirements;
+
+/* Binds one memory range to an opaque byte range of a sparse resource. */
+typedef struct VkSparseMemoryBind {
+    VkDeviceSize               resourceOffset;
+    VkDeviceSize               size;
+    VkDeviceMemory             memory;
+    VkDeviceSize               memoryOffset;
+    VkSparseMemoryBindFlags    flags;
+} VkSparseMemoryBind;
+
+/* A set of opaque binds targeting one buffer. */
+typedef struct VkSparseBufferMemoryBindInfo {
+    VkBuffer                     buffer;
+    uint32_t                     bindCount;
+    const VkSparseMemoryBind*    pBinds;
+} VkSparseBufferMemoryBindInfo;
+
+/* A set of opaque binds targeting one image. */
+typedef struct VkSparseImageOpaqueMemoryBindInfo {
+    VkImage                      image;
+    uint32_t                     bindCount;
+    const VkSparseMemoryBind*    pBinds;
+} VkSparseImageOpaqueMemoryBindInfo;
+
+/* Selects one aspect/mip level/array layer of an image. */
+typedef struct VkImageSubresource {
+    VkImageAspectFlags    aspectMask;
+    uint32_t              mipLevel;
+    uint32_t              arrayLayer;
+} VkImageSubresource;
+
+/* Signed 3-D offset. */
+typedef struct VkOffset3D {
+    int32_t    x;
+    int32_t    y;
+    int32_t    z;
+} VkOffset3D;
+
+/* Binds memory to a (subresource, offset, extent) region of a sparse image. */
+typedef struct VkSparseImageMemoryBind {
+    VkImageSubresource         subresource;
+    VkOffset3D                 offset;
+    VkExtent3D                 extent;
+    VkDeviceMemory             memory;
+    VkDeviceSize               memoryOffset;
+    VkSparseMemoryBindFlags    flags;
+} VkSparseImageMemoryBind;
+
+/* A set of image-region binds targeting one image. */
+typedef struct VkSparseImageMemoryBindInfo {
+    VkImage                           image;
+    uint32_t                          bindCount;
+    const VkSparseImageMemoryBind*    pBinds;
+} VkSparseImageMemoryBindInfo;
+
+/* One sparse binding operation: buffer, opaque-image and image binds,
+   framed by wait/signal semaphores. */
+typedef struct VkBindSparseInfo {
+    VkStructureType                             sType;
+    const void*                                 pNext;
+    uint32_t                                    waitSemaphoreCount;
+    const VkSemaphore*                          pWaitSemaphores;
+    uint32_t                                    bufferBindCount;
+    const VkSparseBufferMemoryBindInfo*         pBufferBinds;
+    uint32_t                                    imageOpaqueBindCount;
+    const VkSparseImageOpaqueMemoryBindInfo*    pImageOpaqueBinds;
+    uint32_t                                    imageBindCount;
+    const VkSparseImageMemoryBindInfo*          pImageBinds;
+    uint32_t                                    signalSemaphoreCount;
+    const VkSemaphore*                          pSignalSemaphores;
+} VkBindSparseInfo;
+
+/* Fence creation parameters. */
+typedef struct VkFenceCreateInfo {
+    VkStructureType       sType;
+    const void*           pNext;
+    VkFenceCreateFlags    flags;
+} VkFenceCreateInfo;
+
+/* Semaphore creation parameters. */
+typedef struct VkSemaphoreCreateInfo {
+    VkStructureType           sType;
+    const void*               pNext;
+    VkSemaphoreCreateFlags    flags;
+} VkSemaphoreCreateInfo;
+
+/* Event creation parameters. */
+typedef struct VkEventCreateInfo {
+    VkStructureType       sType;
+    const void*           pNext;
+    VkEventCreateFlags    flags;
+} VkEventCreateInfo;
+
+/* Query pool creation parameters; pipelineStatistics applies to
+   pipeline-statistics queries. */
+typedef struct VkQueryPoolCreateInfo {
+    VkStructureType                  sType;
+    const void*                      pNext;
+    VkQueryPoolCreateFlags           flags;
+    VkQueryType                      queryType;
+    uint32_t                         queryCount;
+    VkQueryPipelineStatisticFlags    pipelineStatistics;
+} VkQueryPoolCreateInfo;
+
+/* Buffer creation parameters; queue family indices apply when
+   sharingMode is concurrent. */
+typedef struct VkBufferCreateInfo {
+    VkStructureType        sType;
+    const void*            pNext;
+    VkBufferCreateFlags    flags;
+    VkDeviceSize           size;
+    VkBufferUsageFlags     usage;
+    VkSharingMode          sharingMode;
+    uint32_t               queueFamilyIndexCount;
+    const uint32_t*        pQueueFamilyIndices;
+} VkBufferCreateInfo;
+
+/* A formatted view of a buffer range. */
+typedef struct VkBufferViewCreateInfo {
+    VkStructureType            sType;
+    const void*                pNext;
+    VkBufferViewCreateFlags    flags;
+    VkBuffer                   buffer;
+    VkFormat                   format;
+    VkDeviceSize               offset;
+    VkDeviceSize               range;
+} VkBufferViewCreateInfo;
+
+/* Image creation parameters: dimensionality, format, extent, mips/layers,
+   sampling, tiling, usage, sharing and initial layout. */
+typedef struct VkImageCreateInfo {
+    VkStructureType          sType;
+    const void*              pNext;
+    VkImageCreateFlags       flags;
+    VkImageType              imageType;
+    VkFormat                 format;
+    VkExtent3D               extent;
+    uint32_t                 mipLevels;
+    uint32_t                 arrayLayers;
+    VkSampleCountFlagBits    samples;
+    VkImageTiling            tiling;
+    VkImageUsageFlags        usage;
+    VkSharingMode            sharingMode;
+    uint32_t                 queueFamilyIndexCount;
+    const uint32_t*          pQueueFamilyIndices;
+    VkImageLayout            initialLayout;
+} VkImageCreateInfo;
+
+/* Byte layout (offset, size, pitches) of one image subresource. */
+typedef struct VkSubresourceLayout {
+    VkDeviceSize    offset;
+    VkDeviceSize    size;
+    VkDeviceSize    rowPitch;
+    VkDeviceSize    arrayPitch;
+    VkDeviceSize    depthPitch;
+} VkSubresourceLayout;
+
+/* Per-channel swizzle applied by an image view. */
+typedef struct VkComponentMapping {
+    VkComponentSwizzle    r;
+    VkComponentSwizzle    g;
+    VkComponentSwizzle    b;
+    VkComponentSwizzle    a;
+} VkComponentMapping;
+
+/* A contiguous range of mip levels and array layers of one image aspect. */
+typedef struct VkImageSubresourceRange {
+    VkImageAspectFlags    aspectMask;
+    uint32_t              baseMipLevel;
+    uint32_t              levelCount;
+    uint32_t              baseArrayLayer;
+    uint32_t              layerCount;
+} VkImageSubresourceRange;
+
+/* Image view creation parameters: source image, view type/format,
+   component swizzle and subresource range. */
+typedef struct VkImageViewCreateInfo {
+    VkStructureType            sType;
+    const void*                pNext;
+    VkImageViewCreateFlags     flags;
+    VkImage                    image;
+    VkImageViewType            viewType;
+    VkFormat                   format;
+    VkComponentMapping         components;
+    VkImageSubresourceRange    subresourceRange;
+} VkImageViewCreateInfo;
+
+/* Shader module creation: codeSize is in BYTES while pCode is a
+   uint32_t* (SPIR-V words). */
+typedef struct VkShaderModuleCreateInfo {
+    VkStructureType              sType;
+    const void*                  pNext;
+    VkShaderModuleCreateFlags    flags;
+    size_t                       codeSize;
+    const uint32_t*              pCode;
+} VkShaderModuleCreateInfo;
+
+/* Pipeline cache creation, optionally seeded with previously saved data. */
+typedef struct VkPipelineCacheCreateInfo {
+    VkStructureType               sType;
+    const void*                   pNext;
+    VkPipelineCacheCreateFlags    flags;
+    size_t                        initialDataSize;
+    const void*                   pInitialData;
+} VkPipelineCacheCreateInfo;
+
+typedef struct VkSpecializationMapEntry {
+    uint32_t    constantID;
+    uint32_t    offset;
+    size_t      size;
+} VkSpecializationMapEntry;
+
+typedef struct VkSpecializationInfo {
+    uint32_t                           mapEntryCount;
+    const VkSpecializationMapEntry*    pMapEntries;
+    size_t                             dataSize;
+    const void*                        pData;
+} VkSpecializationInfo;
+
+typedef struct VkPipelineShaderStageCreateInfo {
+    VkStructureType                     sType;
+    const void*                         pNext;
+    VkPipelineShaderStageCreateFlags    flags;
+    VkShaderStageFlagBits               stage;
+    VkShaderModule                      module;
+    const char*                         pName;
+    const VkSpecializationInfo*         pSpecializationInfo;
+} VkPipelineShaderStageCreateInfo;
+
+typedef struct VkVertexInputBindingDescription {
+    uint32_t             binding;
+    uint32_t             stride;
+    VkVertexInputRate    inputRate;
+} VkVertexInputBindingDescription;
+
+typedef struct VkVertexInputAttributeDescription {
+    uint32_t    location;
+    uint32_t    binding;
+    VkFormat    format;
+    uint32_t    offset;
+} VkVertexInputAttributeDescription;
+
+typedef struct VkPipelineVertexInputStateCreateInfo {
+    VkStructureType                             sType;
+    const void*                                 pNext;
+    VkPipelineVertexInputStateCreateFlags       flags;
+    uint32_t                                    vertexBindingDescriptionCount;
+    const VkVertexInputBindingDescription*      pVertexBindingDescriptions;
+    uint32_t                                    vertexAttributeDescriptionCount;
+    const VkVertexInputAttributeDescription*    pVertexAttributeDescriptions;
+} VkPipelineVertexInputStateCreateInfo;
+
+typedef struct VkPipelineInputAssemblyStateCreateInfo {
+    VkStructureType                            sType;
+    const void*                                pNext;
+    VkPipelineInputAssemblyStateCreateFlags    flags;
+    VkPrimitiveTopology                        topology;
+    VkBool32                                   primitiveRestartEnable;
+} VkPipelineInputAssemblyStateCreateInfo;
+
+typedef struct VkPipelineTessellationStateCreateInfo {
+    VkStructureType                           sType;
+    const void*                               pNext;
+    VkPipelineTessellationStateCreateFlags    flags;
+    uint32_t                                  patchControlPoints;
+} VkPipelineTessellationStateCreateInfo;
+
+typedef struct VkViewport {
+    float    x;
+    float    y;
+    float    width;
+    float    height;
+    float    minDepth;
+    float    maxDepth;
+} VkViewport;
+
+typedef struct VkOffset2D {
+    int32_t    x;
+    int32_t    y;
+} VkOffset2D;
+
+typedef struct VkExtent2D {
+    uint32_t    width;
+    uint32_t    height;
+} VkExtent2D;
+
+typedef struct VkRect2D {
+    VkOffset2D    offset;
+    VkExtent2D    extent;
+} VkRect2D;
+
+typedef struct VkPipelineViewportStateCreateInfo {
+    VkStructureType                       sType;
+    const void*                           pNext;
+    VkPipelineViewportStateCreateFlags    flags;
+    uint32_t                              viewportCount;
+    const VkViewport*                     pViewports;
+    uint32_t                              scissorCount;
+    const VkRect2D*                       pScissors;
+} VkPipelineViewportStateCreateInfo;
+
+typedef struct VkPipelineRasterizationStateCreateInfo {
+    VkStructureType                            sType;
+    const void*                                pNext;
+    VkPipelineRasterizationStateCreateFlags    flags;
+    VkBool32                                   depthClampEnable;
+    VkBool32                                   rasterizerDiscardEnable;
+    VkPolygonMode                              polygonMode;
+    VkCullModeFlags                            cullMode;
+    VkFrontFace                                frontFace;
+    VkBool32                                   depthBiasEnable;
+    float                                      depthBiasConstantFactor;
+    float                                      depthBiasClamp;
+    float                                      depthBiasSlopeFactor;
+    float                                      lineWidth;
+} VkPipelineRasterizationStateCreateInfo;
+
+typedef struct VkPipelineMultisampleStateCreateInfo {
+    VkStructureType                          sType;
+    const void*                              pNext;
+    VkPipelineMultisampleStateCreateFlags    flags;
+    VkSampleCountFlagBits                    rasterizationSamples;
+    VkBool32                                 sampleShadingEnable;
+    float                                    minSampleShading;
+    const VkSampleMask*                      pSampleMask;
+    VkBool32                                 alphaToCoverageEnable;
+    VkBool32                                 alphaToOneEnable;
+} VkPipelineMultisampleStateCreateInfo;
+
+typedef struct VkStencilOpState {
+    VkStencilOp    failOp;
+    VkStencilOp    passOp;
+    VkStencilOp    depthFailOp;
+    VkCompareOp    compareOp;
+    uint32_t       compareMask;
+    uint32_t       writeMask;
+    uint32_t       reference;
+} VkStencilOpState;
+
+typedef struct VkPipelineDepthStencilStateCreateInfo {
+    VkStructureType                           sType;
+    const void*                               pNext;
+    VkPipelineDepthStencilStateCreateFlags    flags;
+    VkBool32                                  depthTestEnable;
+    VkBool32                                  depthWriteEnable;
+    VkCompareOp                               depthCompareOp;
+    VkBool32                                  depthBoundsTestEnable;
+    VkBool32                                  stencilTestEnable;
+    VkStencilOpState                          front;
+    VkStencilOpState                          back;
+    float                                     minDepthBounds;
+    float                                     maxDepthBounds;
+} VkPipelineDepthStencilStateCreateInfo;
+
+typedef struct VkPipelineColorBlendAttachmentState {
+    VkBool32                 blendEnable;
+    VkBlendFactor            srcColorBlendFactor;
+    VkBlendFactor            dstColorBlendFactor;
+    VkBlendOp                colorBlendOp;
+    VkBlendFactor            srcAlphaBlendFactor;
+    VkBlendFactor            dstAlphaBlendFactor;
+    VkBlendOp                alphaBlendOp;
+    VkColorComponentFlags    colorWriteMask;
+} VkPipelineColorBlendAttachmentState;
+
+typedef struct VkPipelineColorBlendStateCreateInfo {
+    VkStructureType                               sType;
+    const void*                                   pNext;
+    VkPipelineColorBlendStateCreateFlags          flags;
+    VkBool32                                      logicOpEnable;
+    VkLogicOp                                     logicOp;
+    uint32_t                                      attachmentCount;
+    const VkPipelineColorBlendAttachmentState*    pAttachments;
+    float                                         blendConstants[4];
+} VkPipelineColorBlendStateCreateInfo;
+
+typedef struct VkPipelineDynamicStateCreateInfo {
+    VkStructureType                      sType;
+    const void*                          pNext;
+    VkPipelineDynamicStateCreateFlags    flags;
+    uint32_t                             dynamicStateCount;
+    const VkDynamicState*                pDynamicStates;
+} VkPipelineDynamicStateCreateInfo;
+
+typedef struct VkGraphicsPipelineCreateInfo {
+    VkStructureType                                  sType;
+    const void*                                      pNext;
+    VkPipelineCreateFlags                            flags;
+    uint32_t                                         stageCount;
+    const VkPipelineShaderStageCreateInfo*           pStages;
+    const VkPipelineVertexInputStateCreateInfo*      pVertexInputState;
+    const VkPipelineInputAssemblyStateCreateInfo*    pInputAssemblyState;
+    const VkPipelineTessellationStateCreateInfo*     pTessellationState;
+    const VkPipelineViewportStateCreateInfo*         pViewportState;
+    const VkPipelineRasterizationStateCreateInfo*    pRasterizationState;
+    const VkPipelineMultisampleStateCreateInfo*      pMultisampleState;
+    const VkPipelineDepthStencilStateCreateInfo*     pDepthStencilState;
+    const VkPipelineColorBlendStateCreateInfo*       pColorBlendState;
+    const VkPipelineDynamicStateCreateInfo*          pDynamicState;
+    VkPipelineLayout                                 layout;
+    VkRenderPass                                     renderPass;
+    uint32_t                                         subpass;
+    VkPipeline                                       basePipelineHandle;
+    int32_t                                          basePipelineIndex;
+} VkGraphicsPipelineCreateInfo;
+
+typedef struct VkComputePipelineCreateInfo {
+    VkStructureType                    sType;
+    const void*                        pNext;
+    VkPipelineCreateFlags              flags;
+    VkPipelineShaderStageCreateInfo    stage;
+    VkPipelineLayout                   layout;
+    VkPipeline                         basePipelineHandle;
+    int32_t                            basePipelineIndex;
+} VkComputePipelineCreateInfo;
+
+typedef struct VkPushConstantRange {
+    VkShaderStageFlags    stageFlags;
+    uint32_t              offset;
+    uint32_t              size;
+} VkPushConstantRange;
+
+typedef struct VkPipelineLayoutCreateInfo {
+    VkStructureType                 sType;
+    const void*                     pNext;
+    VkPipelineLayoutCreateFlags     flags;
+    uint32_t                        setLayoutCount;
+    const VkDescriptorSetLayout*    pSetLayouts;
+    uint32_t                        pushConstantRangeCount;
+    const VkPushConstantRange*      pPushConstantRanges;
+} VkPipelineLayoutCreateInfo;
+
+/* Creation parameters for a sampler object: filtering, mipmap mode,
+ * per-axis address (wrap) modes, LOD clamping, optional anisotropy and
+ * depth-compare state, and the border color used by clamp-to-border
+ * addressing. */
+typedef struct VkSamplerCreateInfo {
+    VkStructureType         sType;
+    const void*             pNext;
+    VkSamplerCreateFlags    flags;
+    VkFilter                magFilter;
+    VkFilter                minFilter;
+    VkSamplerMipmapMode     mipmapMode;
+    VkSamplerAddressMode    addressModeU;
+    VkSamplerAddressMode    addressModeV;
+    VkSamplerAddressMode    addressModeW;
+    float                   mipLodBias;
+    VkBool32                anisotropyEnable;
+    float                   maxAnisotropy;
+    VkBool32                compareEnable;
+    VkCompareOp             compareOp;
+    float                   minLod;
+    float                   maxLod;
+    VkBorderColor           borderColor;
+    VkBool32                unnormalizedCoordinates;
+} VkSamplerCreateInfo;
+
+/* One binding slot within a descriptor set layout: its binding number,
+ * descriptor type, array size (descriptorCount), the shader stages that
+ * can access it, and optionally a fixed array of immutable samplers. */
+typedef struct VkDescriptorSetLayoutBinding {
+    uint32_t              binding;
+    VkDescriptorType      descriptorType;
+    uint32_t              descriptorCount;
+    VkShaderStageFlags    stageFlags;
+    const VkSampler*      pImmutableSamplers;
+} VkDescriptorSetLayoutBinding;
+
+/* Creation parameters for a descriptor set layout: the set of bindings
+ * it contains. */
+typedef struct VkDescriptorSetLayoutCreateInfo {
+    VkStructureType                        sType;
+    const void*                            pNext;
+    VkDescriptorSetLayoutCreateFlags       flags;
+    uint32_t                               bindingCount;
+    const VkDescriptorSetLayoutBinding*    pBindings;
+} VkDescriptorSetLayoutCreateInfo;
+
+/* Capacity entry for a descriptor pool: how many descriptors of a given
+ * type the pool can supply. */
+typedef struct VkDescriptorPoolSize {
+    VkDescriptorType    type;
+    uint32_t            descriptorCount;
+} VkDescriptorPoolSize;
+
+/* Creation parameters for a descriptor pool: the maximum number of sets
+ * that can be allocated from it and the per-type descriptor capacities. */
+typedef struct VkDescriptorPoolCreateInfo {
+    VkStructureType                sType;
+    const void*                    pNext;
+    VkDescriptorPoolCreateFlags    flags;
+    uint32_t                       maxSets;
+    uint32_t                       poolSizeCount;
+    const VkDescriptorPoolSize*    pPoolSizes;
+} VkDescriptorPoolCreateInfo;
+
+/* Parameters for vkAllocateDescriptorSets: the source pool and one
+ * layout per set being allocated (descriptorSetCount entries). */
+typedef struct VkDescriptorSetAllocateInfo {
+    VkStructureType                 sType;
+    const void*                     pNext;
+    VkDescriptorPool                descriptorPool;
+    uint32_t                        descriptorSetCount;
+    const VkDescriptorSetLayout*    pSetLayouts;
+} VkDescriptorSetAllocateInfo;
+
+/* Payload for image/sampler descriptor writes: which fields are read
+ * depends on the descriptor type being written. */
+typedef struct VkDescriptorImageInfo {
+    VkSampler        sampler;
+    VkImageView      imageView;
+    VkImageLayout    imageLayout;
+} VkDescriptorImageInfo;
+
+/* Payload for buffer descriptor writes: a (buffer, offset, range)
+ * sub-span of a VkBuffer. */
+typedef struct VkDescriptorBufferInfo {
+    VkBuffer        buffer;
+    VkDeviceSize    offset;
+    VkDeviceSize    range;
+} VkDescriptorBufferInfo;
+
+/* One update operation for vkUpdateDescriptorSets: writes
+ * descriptorCount descriptors of descriptorType into dstSet starting at
+ * (dstBinding, dstArrayElement).  Exactly one of pImageInfo,
+ * pBufferInfo, or pTexelBufferView is consumed, selected by the
+ * descriptor type; the others are ignored. */
+typedef struct VkWriteDescriptorSet {
+    VkStructureType                  sType;
+    const void*                      pNext;
+    VkDescriptorSet                  dstSet;
+    uint32_t                         dstBinding;
+    uint32_t                         dstArrayElement;
+    uint32_t                         descriptorCount;
+    VkDescriptorType                 descriptorType;
+    const VkDescriptorImageInfo*     pImageInfo;
+    const VkDescriptorBufferInfo*    pBufferInfo;
+    const VkBufferView*              pTexelBufferView;
+} VkWriteDescriptorSet;
+
+/* Copies descriptorCount descriptors from (srcSet, srcBinding,
+ * srcArrayElement) to the corresponding dst location. */
+typedef struct VkCopyDescriptorSet {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkDescriptorSet    srcSet;
+    uint32_t           srcBinding;
+    uint32_t           srcArrayElement;
+    VkDescriptorSet    dstSet;
+    uint32_t           dstBinding;
+    uint32_t           dstArrayElement;
+    uint32_t           descriptorCount;
+} VkCopyDescriptorSet;
+
+/* Creation parameters for a framebuffer: the render pass it must be
+ * compatible with, the image views backing each attachment, and the
+ * framebuffer dimensions (width x height x layers). */
+typedef struct VkFramebufferCreateInfo {
+    VkStructureType             sType;
+    const void*                 pNext;
+    VkFramebufferCreateFlags    flags;
+    VkRenderPass                renderPass;
+    uint32_t                    attachmentCount;
+    const VkImageView*          pAttachments;
+    uint32_t                    width;
+    uint32_t                    height;
+    uint32_t                    layers;
+} VkFramebufferCreateInfo;
+
+/* Describes one render-pass attachment: its format/sample count,
+ * load/store behavior at render-pass boundaries (with separate stencil
+ * ops), and the layouts the image is in when the pass begins and ends. */
+typedef struct VkAttachmentDescription {
+    VkAttachmentDescriptionFlags    flags;
+    VkFormat                        format;
+    VkSampleCountFlagBits           samples;
+    VkAttachmentLoadOp              loadOp;
+    VkAttachmentStoreOp             storeOp;
+    VkAttachmentLoadOp              stencilLoadOp;
+    VkAttachmentStoreOp             stencilStoreOp;
+    VkImageLayout                   initialLayout;
+    VkImageLayout                   finalLayout;
+} VkAttachmentDescription;
+
+/* Index into VkRenderPassCreateInfo::pAttachments plus the layout the
+ * attachment is used in during a subpass. */
+typedef struct VkAttachmentReference {
+    uint32_t         attachment;
+    VkImageLayout    layout;
+} VkAttachmentReference;
+
+/* Describes one subpass: the attachments it reads as input, writes as
+ * color (with optional per-color resolve targets), uses for
+ * depth/stencil, and merely preserves.  pResolveAttachments, when
+ * non-NULL, has colorAttachmentCount entries. */
+typedef struct VkSubpassDescription {
+    VkSubpassDescriptionFlags       flags;
+    VkPipelineBindPoint             pipelineBindPoint;
+    uint32_t                        inputAttachmentCount;
+    const VkAttachmentReference*    pInputAttachments;
+    uint32_t                        colorAttachmentCount;
+    const VkAttachmentReference*    pColorAttachments;
+    const VkAttachmentReference*    pResolveAttachments;
+    const VkAttachmentReference*    pDepthStencilAttachment;
+    uint32_t                        preserveAttachmentCount;
+    const uint32_t*                 pPreserveAttachments;
+} VkSubpassDescription;
+
+/* Execution/memory dependency between two subpasses of a render pass
+ * (VK_SUBPASS_EXTERNAL may appear on either side to denote work outside
+ * the pass), expressed as stage and access masks. */
+typedef struct VkSubpassDependency {
+    uint32_t                srcSubpass;
+    uint32_t                dstSubpass;
+    VkPipelineStageFlags    srcStageMask;
+    VkPipelineStageFlags    dstStageMask;
+    VkAccessFlags           srcAccessMask;
+    VkAccessFlags           dstAccessMask;
+    VkDependencyFlags       dependencyFlags;
+} VkSubpassDependency;
+
+/* Creation parameters for a render pass: its attachments, subpasses,
+ * and the dependencies ordering them. */
+typedef struct VkRenderPassCreateInfo {
+    VkStructureType                   sType;
+    const void*                       pNext;
+    VkRenderPassCreateFlags           flags;
+    uint32_t                          attachmentCount;
+    const VkAttachmentDescription*    pAttachments;
+    uint32_t                          subpassCount;
+    const VkSubpassDescription*       pSubpasses;
+    uint32_t                          dependencyCount;
+    const VkSubpassDependency*        pDependencies;
+} VkRenderPassCreateInfo;
+
+/* Creation parameters for a command pool; buffers allocated from it may
+ * only be submitted to queues of the given family. */
+typedef struct VkCommandPoolCreateInfo {
+    VkStructureType             sType;
+    const void*                 pNext;
+    VkCommandPoolCreateFlags    flags;
+    uint32_t                    queueFamilyIndex;
+} VkCommandPoolCreateInfo;
+
+/* Parameters for vkAllocateCommandBuffers: source pool, primary vs.
+ * secondary level, and how many buffers to allocate. */
+typedef struct VkCommandBufferAllocateInfo {
+    VkStructureType         sType;
+    const void*             pNext;
+    VkCommandPool           commandPool;
+    VkCommandBufferLevel    level;
+    uint32_t                commandBufferCount;
+} VkCommandBufferAllocateInfo;
+
+/* State a secondary command buffer inherits from the primary that
+ * executes it: the render pass/subpass (and optionally framebuffer) it
+ * will run inside, plus occlusion-query and pipeline-statistics
+ * inheritance. */
+typedef struct VkCommandBufferInheritanceInfo {
+    VkStructureType                  sType;
+    const void*                      pNext;
+    VkRenderPass                     renderPass;
+    uint32_t                         subpass;
+    VkFramebuffer                    framebuffer;
+    VkBool32                         occlusionQueryEnable;
+    VkQueryControlFlags              queryFlags;
+    VkQueryPipelineStatisticFlags    pipelineStatistics;
+} VkCommandBufferInheritanceInfo;
+
+/* Parameters for vkBeginCommandBuffer; pInheritanceInfo is consumed
+ * only for secondary command buffers. */
+typedef struct VkCommandBufferBeginInfo {
+    VkStructureType                          sType;
+    const void*                              pNext;
+    VkCommandBufferUsageFlags                flags;
+    const VkCommandBufferInheritanceInfo*    pInheritanceInfo;
+} VkCommandBufferBeginInfo;
+
+/* One region for vkCmdCopyBuffer: size bytes from srcOffset to
+ * dstOffset. */
+typedef struct VkBufferCopy {
+    VkDeviceSize    srcOffset;
+    VkDeviceSize    dstOffset;
+    VkDeviceSize    size;
+} VkBufferCopy;
+
+/* Selects a single mip level and a contiguous range of array layers of
+ * an image, per aspect (color/depth/stencil), for transfer commands. */
+typedef struct VkImageSubresourceLayers {
+    VkImageAspectFlags    aspectMask;
+    uint32_t              mipLevel;
+    uint32_t              baseArrayLayer;
+    uint32_t              layerCount;
+} VkImageSubresourceLayers;
+
+/* One region for vkCmdCopyImage: matching extents copied between the
+ * given subresources/offsets of the source and destination images. */
+typedef struct VkImageCopy {
+    VkImageSubresourceLayers    srcSubresource;
+    VkOffset3D                  srcOffset;
+    VkImageSubresourceLayers    dstSubresource;
+    VkOffset3D                  dstOffset;
+    VkExtent3D                  extent;
+} VkImageCopy;
+
+/* One region for vkCmdBlitImage: the two opposite corners of the source
+ * and destination boxes (srcOffsets/dstOffsets), allowing scaling and
+ * axis flips. */
+typedef struct VkImageBlit {
+    VkImageSubresourceLayers    srcSubresource;
+    VkOffset3D                  srcOffsets[2];
+    VkImageSubresourceLayers    dstSubresource;
+    VkOffset3D                  dstOffsets[2];
+} VkImageBlit;
+
+/* One region for buffer<->image copies.  bufferRowLength and
+ * bufferImageHeight describe the buffer-side addressing pitch; per the
+ * Vulkan spec a value of 0 means "tightly packed to imageExtent". */
+typedef struct VkBufferImageCopy {
+    VkDeviceSize                bufferOffset;
+    uint32_t                    bufferRowLength;
+    uint32_t                    bufferImageHeight;
+    VkImageSubresourceLayers    imageSubresource;
+    VkOffset3D                  imageOffset;
+    VkExtent3D                  imageExtent;
+} VkBufferImageCopy;
+
+/* Color clear value; which arm is read is determined by the numeric
+ * format of the image being cleared. */
+typedef union VkClearColorValue {
+    float       float32[4];
+    int32_t     int32[4];
+    uint32_t    uint32[4];
+} VkClearColorValue;
+
+/* Clear value for depth/stencil attachments. */
+typedef struct VkClearDepthStencilValue {
+    float       depth;
+    uint32_t    stencil;
+} VkClearDepthStencilValue;
+
+/* Per-attachment clear value: color for color attachments,
+ * depthStencil for depth/stencil attachments. */
+typedef union VkClearValue {
+    VkClearColorValue           color;
+    VkClearDepthStencilValue    depthStencil;
+} VkClearValue;
+
+/* One attachment to clear with vkCmdClearAttachments; colorAttachment
+ * is consumed only when aspectMask selects a color aspect. */
+typedef struct VkClearAttachment {
+    VkImageAspectFlags    aspectMask;
+    uint32_t              colorAttachment;
+    VkClearValue          clearValue;
+} VkClearAttachment;
+
+/* Rectangular region (plus array-layer range) to clear within the
+ * current render area. */
+typedef struct VkClearRect {
+    VkRect2D    rect;
+    uint32_t    baseArrayLayer;
+    uint32_t    layerCount;
+} VkClearRect;
+
+/* One region for vkCmdResolveImage: a multisampled source region
+ * resolved into a single-sample destination region. */
+typedef struct VkImageResolve {
+    VkImageSubresourceLayers    srcSubresource;
+    VkOffset3D                  srcOffset;
+    VkImageSubresourceLayers    dstSubresource;
+    VkOffset3D                  dstOffset;
+    VkExtent3D                  extent;
+} VkImageResolve;
+
+/* Global memory barrier: makes src accesses available and visible to
+ * dst accesses across all resources. */
+typedef struct VkMemoryBarrier {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkAccessFlags      srcAccessMask;
+    VkAccessFlags      dstAccessMask;
+} VkMemoryBarrier;
+
+/* Memory barrier scoped to a byte range of one buffer; the queue family
+ * indices additionally express a queue-family ownership transfer when
+ * they differ. */
+typedef struct VkBufferMemoryBarrier {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkAccessFlags      srcAccessMask;
+    VkAccessFlags      dstAccessMask;
+    uint32_t           srcQueueFamilyIndex;
+    uint32_t           dstQueueFamilyIndex;
+    VkBuffer           buffer;
+    VkDeviceSize       offset;
+    VkDeviceSize       size;
+} VkBufferMemoryBarrier;
+
+/* Memory barrier scoped to an image subresource range; also performs
+ * the oldLayout -> newLayout image layout transition and, when the
+ * queue family indices differ, a queue-family ownership transfer. */
+typedef struct VkImageMemoryBarrier {
+    VkStructureType            sType;
+    const void*                pNext;
+    VkAccessFlags              srcAccessMask;
+    VkAccessFlags              dstAccessMask;
+    VkImageLayout              oldLayout;
+    VkImageLayout              newLayout;
+    uint32_t                   srcQueueFamilyIndex;
+    uint32_t                   dstQueueFamilyIndex;
+    VkImage                    image;
+    VkImageSubresourceRange    subresourceRange;
+} VkImageMemoryBarrier;
+
+/* Parameters for vkCmdBeginRenderPass: the pass/framebuffer pair, the
+ * render area affected, and one clear value per attachment whose loadOp
+ * clears (indexed by attachment number). */
+typedef struct VkRenderPassBeginInfo {
+    VkStructureType        sType;
+    const void*            pNext;
+    VkRenderPass           renderPass;
+    VkFramebuffer          framebuffer;
+    VkRect2D               renderArea;
+    uint32_t               clearValueCount;
+    const VkClearValue*    pClearValues;
+} VkRenderPassBeginInfo;
+
+/* GPU-readable layout of the argument buffer for
+ * vkCmdDispatchIndirect. */
+typedef struct VkDispatchIndirectCommand {
+    uint32_t    x;
+    uint32_t    y;
+    uint32_t    z;
+} VkDispatchIndirectCommand;
+
+/* GPU-readable layout of one record consumed by
+ * vkCmdDrawIndexedIndirect. */
+typedef struct VkDrawIndexedIndirectCommand {
+    uint32_t    indexCount;
+    uint32_t    instanceCount;
+    uint32_t    firstIndex;
+    int32_t     vertexOffset;
+    uint32_t    firstInstance;
+} VkDrawIndexedIndirectCommand;
+
+/* GPU-readable layout of one record consumed by vkCmdDrawIndirect. */
+typedef struct VkDrawIndirectCommand {
+    uint32_t    vertexCount;
+    uint32_t    instanceCount;
+    uint32_t    firstVertex;
+    uint32_t    firstInstance;
+} VkDrawIndirectCommand;
+
+
+/* Function-pointer typedefs for every core Vulkan 1.0 entry point, for
+ * use with vkGetInstanceProcAddr / vkGetDeviceProcAddr when loading the
+ * API dynamically.  Signatures mirror the prototypes below exactly. */
+
+/* Instance, physical device, and device lifetime/query entry points. */
+typedef VkResult (VKAPI_PTR *PFN_vkCreateInstance)(const VkInstanceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkInstance* pInstance);
+typedef void (VKAPI_PTR *PFN_vkDestroyInstance)(VkInstance instance, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkEnumeratePhysicalDevices)(VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceFeatures)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures* pFeatures);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties* pFormatProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceImageFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkImageFormatProperties* pImageFormatProperties);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceProperties)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties* pProperties);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceQueueFamilyProperties)(VkPhysicalDevice physicalDevice, uint32_t* pQueueFamilyPropertyCount, VkQueueFamilyProperties* pQueueFamilyProperties);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceMemoryProperties)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties* pMemoryProperties);
+typedef PFN_vkVoidFunction (VKAPI_PTR *PFN_vkGetInstanceProcAddr)(VkInstance instance, const char* pName);
+typedef PFN_vkVoidFunction (VKAPI_PTR *PFN_vkGetDeviceProcAddr)(VkDevice device, const char* pName);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDevice)(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDevice* pDevice);
+typedef void (VKAPI_PTR *PFN_vkDestroyDevice)(VkDevice device, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkEnumerateInstanceExtensionProperties)(const char* pLayerName, uint32_t* pPropertyCount, VkExtensionProperties* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkEnumerateDeviceExtensionProperties)(VkPhysicalDevice physicalDevice, const char* pLayerName, uint32_t* pPropertyCount, VkExtensionProperties* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkEnumerateInstanceLayerProperties)(uint32_t* pPropertyCount, VkLayerProperties* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkEnumerateDeviceLayerProperties)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkLayerProperties* pProperties);
+
+/* Queues, memory, sparse binding, and synchronization primitives. */
+typedef void (VKAPI_PTR *PFN_vkGetDeviceQueue)(VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue* pQueue);
+typedef VkResult (VKAPI_PTR *PFN_vkQueueSubmit)(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence);
+typedef VkResult (VKAPI_PTR *PFN_vkQueueWaitIdle)(VkQueue queue);
+typedef VkResult (VKAPI_PTR *PFN_vkDeviceWaitIdle)(VkDevice device);
+typedef VkResult (VKAPI_PTR *PFN_vkAllocateMemory)(VkDevice device, const VkMemoryAllocateInfo* pAllocateInfo, const VkAllocationCallbacks* pAllocator, VkDeviceMemory* pMemory);
+typedef void (VKAPI_PTR *PFN_vkFreeMemory)(VkDevice device, VkDeviceMemory memory, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkMapMemory)(VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void** ppData);
+typedef void (VKAPI_PTR *PFN_vkUnmapMemory)(VkDevice device, VkDeviceMemory memory);
+typedef VkResult (VKAPI_PTR *PFN_vkFlushMappedMemoryRanges)(VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange* pMemoryRanges);
+typedef VkResult (VKAPI_PTR *PFN_vkInvalidateMappedMemoryRanges)(VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange* pMemoryRanges);
+typedef void (VKAPI_PTR *PFN_vkGetDeviceMemoryCommitment)(VkDevice device, VkDeviceMemory memory, VkDeviceSize* pCommittedMemoryInBytes);
+typedef VkResult (VKAPI_PTR *PFN_vkBindBufferMemory)(VkDevice device, VkBuffer buffer, VkDeviceMemory memory, VkDeviceSize memoryOffset);
+typedef VkResult (VKAPI_PTR *PFN_vkBindImageMemory)(VkDevice device, VkImage image, VkDeviceMemory memory, VkDeviceSize memoryOffset);
+typedef void (VKAPI_PTR *PFN_vkGetBufferMemoryRequirements)(VkDevice device, VkBuffer buffer, VkMemoryRequirements* pMemoryRequirements);
+typedef void (VKAPI_PTR *PFN_vkGetImageMemoryRequirements)(VkDevice device, VkImage image, VkMemoryRequirements* pMemoryRequirements);
+typedef void (VKAPI_PTR *PFN_vkGetImageSparseMemoryRequirements)(VkDevice device, VkImage image, uint32_t* pSparseMemoryRequirementCount, VkSparseImageMemoryRequirements* pSparseMemoryRequirements);
+typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceSparseImageFormatProperties)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkSampleCountFlagBits samples, VkImageUsageFlags usage, VkImageTiling tiling, uint32_t* pPropertyCount, VkSparseImageFormatProperties* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkQueueBindSparse)(VkQueue queue, uint32_t bindInfoCount, const VkBindSparseInfo* pBindInfo, VkFence fence);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateFence)(VkDevice device, const VkFenceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkFence* pFence);
+typedef void (VKAPI_PTR *PFN_vkDestroyFence)(VkDevice device, VkFence fence, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkResetFences)(VkDevice device, uint32_t fenceCount, const VkFence* pFences);
+typedef VkResult (VKAPI_PTR *PFN_vkGetFenceStatus)(VkDevice device, VkFence fence);
+typedef VkResult (VKAPI_PTR *PFN_vkWaitForFences)(VkDevice device, uint32_t fenceCount, const VkFence* pFences, VkBool32 waitAll, uint64_t timeout);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateSemaphore)(VkDevice device, const VkSemaphoreCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSemaphore* pSemaphore);
+typedef void (VKAPI_PTR *PFN_vkDestroySemaphore)(VkDevice device, VkSemaphore semaphore, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateEvent)(VkDevice device, const VkEventCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkEvent* pEvent);
+typedef void (VKAPI_PTR *PFN_vkDestroyEvent)(VkDevice device, VkEvent event, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkGetEventStatus)(VkDevice device, VkEvent event);
+typedef VkResult (VKAPI_PTR *PFN_vkSetEvent)(VkDevice device, VkEvent event);
+typedef VkResult (VKAPI_PTR *PFN_vkResetEvent)(VkDevice device, VkEvent event);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateQueryPool)(VkDevice device, const VkQueryPoolCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkQueryPool* pQueryPool);
+typedef void (VKAPI_PTR *PFN_vkDestroyQueryPool)(VkDevice device, VkQueryPool queryPool, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkGetQueryPoolResults)(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void* pData, VkDeviceSize stride, VkQueryResultFlags flags);
+
+/* Resource objects, pipelines, descriptors, passes, command buffers. */
+typedef VkResult (VKAPI_PTR *PFN_vkCreateBuffer)(VkDevice device, const VkBufferCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkBuffer* pBuffer);
+typedef void (VKAPI_PTR *PFN_vkDestroyBuffer)(VkDevice device, VkBuffer buffer, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateBufferView)(VkDevice device, const VkBufferViewCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkBufferView* pView);
+typedef void (VKAPI_PTR *PFN_vkDestroyBufferView)(VkDevice device, VkBufferView bufferView, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateImage)(VkDevice device, const VkImageCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkImage* pImage);
+typedef void (VKAPI_PTR *PFN_vkDestroyImage)(VkDevice device, VkImage image, const VkAllocationCallbacks* pAllocator);
+typedef void (VKAPI_PTR *PFN_vkGetImageSubresourceLayout)(VkDevice device, VkImage image, const VkImageSubresource* pSubresource, VkSubresourceLayout* pLayout);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateImageView)(VkDevice device, const VkImageViewCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkImageView* pView);
+typedef void (VKAPI_PTR *PFN_vkDestroyImageView)(VkDevice device, VkImageView imageView, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateShaderModule)(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule);
+typedef void (VKAPI_PTR *PFN_vkDestroyShaderModule)(VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreatePipelineCache)(VkDevice device, const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache);
+typedef void (VKAPI_PTR *PFN_vkDestroyPipelineCache)(VkDevice device, VkPipelineCache pipelineCache, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPipelineCacheData)(VkDevice device, VkPipelineCache pipelineCache, size_t* pDataSize, void* pData);
+typedef VkResult (VKAPI_PTR *PFN_vkMergePipelineCaches)(VkDevice device, VkPipelineCache dstCache, uint32_t srcCacheCount, const VkPipelineCache* pSrcCaches);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateGraphicsPipelines)(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkGraphicsPipelineCreateInfo* pCreateInfos, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateComputePipelines)(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo* pCreateInfos, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines);
+typedef void (VKAPI_PTR *PFN_vkDestroyPipeline)(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreatePipelineLayout)(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout);
+typedef void (VKAPI_PTR *PFN_vkDestroyPipelineLayout)(VkDevice device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateSampler)(VkDevice device, const VkSamplerCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSampler* pSampler);
+typedef void (VKAPI_PTR *PFN_vkDestroySampler)(VkDevice device, VkSampler sampler, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDescriptorSetLayout)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDescriptorSetLayout* pSetLayout);
+typedef void (VKAPI_PTR *PFN_vkDestroyDescriptorSetLayout)(VkDevice device, VkDescriptorSetLayout descriptorSetLayout, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDescriptorPool)(VkDevice device, const VkDescriptorPoolCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDescriptorPool* pDescriptorPool);
+typedef void (VKAPI_PTR *PFN_vkDestroyDescriptorPool)(VkDevice device, VkDescriptorPool descriptorPool, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkResetDescriptorPool)(VkDevice device, VkDescriptorPool descriptorPool, VkDescriptorPoolResetFlags flags);
+typedef VkResult (VKAPI_PTR *PFN_vkAllocateDescriptorSets)(VkDevice device, const VkDescriptorSetAllocateInfo* pAllocateInfo, VkDescriptorSet* pDescriptorSets);
+typedef VkResult (VKAPI_PTR *PFN_vkFreeDescriptorSets)(VkDevice device, VkDescriptorPool descriptorPool, uint32_t descriptorSetCount, const VkDescriptorSet* pDescriptorSets);
+typedef void (VKAPI_PTR *PFN_vkUpdateDescriptorSets)(VkDevice device, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites, uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateFramebuffer)(VkDevice device, const VkFramebufferCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkFramebuffer* pFramebuffer);
+typedef void (VKAPI_PTR *PFN_vkDestroyFramebuffer)(VkDevice device, VkFramebuffer framebuffer, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateRenderPass)(VkDevice device, const VkRenderPassCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void (VKAPI_PTR *PFN_vkDestroyRenderPass)(VkDevice device, VkRenderPass renderPass, const VkAllocationCallbacks* pAllocator);
+typedef void (VKAPI_PTR *PFN_vkGetRenderAreaGranularity)(VkDevice device, VkRenderPass renderPass, VkExtent2D* pGranularity);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateCommandPool)(VkDevice device, const VkCommandPoolCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkCommandPool* pCommandPool);
+typedef void (VKAPI_PTR *PFN_vkDestroyCommandPool)(VkDevice device, VkCommandPool commandPool, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkResetCommandPool)(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags);
+typedef VkResult (VKAPI_PTR *PFN_vkAllocateCommandBuffers)(VkDevice device, const VkCommandBufferAllocateInfo* pAllocateInfo, VkCommandBuffer* pCommandBuffers);
+typedef void (VKAPI_PTR *PFN_vkFreeCommandBuffers)(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount, const VkCommandBuffer* pCommandBuffers);
+typedef VkResult (VKAPI_PTR *PFN_vkBeginCommandBuffer)(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo* pBeginInfo);
+typedef VkResult (VKAPI_PTR *PFN_vkEndCommandBuffer)(VkCommandBuffer commandBuffer);
+typedef VkResult (VKAPI_PTR *PFN_vkResetCommandBuffer)(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags);
+
+/* Command-buffer recording (vkCmd*) entry points. */
+typedef void (VKAPI_PTR *PFN_vkCmdBindPipeline)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline);
+typedef void (VKAPI_PTR *PFN_vkCmdSetViewport)(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewport* pViewports);
+typedef void (VKAPI_PTR *PFN_vkCmdSetScissor)(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D* pScissors);
+typedef void (VKAPI_PTR *PFN_vkCmdSetLineWidth)(VkCommandBuffer commandBuffer, float lineWidth);
+typedef void (VKAPI_PTR *PFN_vkCmdSetDepthBias)(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, float depthBiasClamp, float depthBiasSlopeFactor);
+typedef void (VKAPI_PTR *PFN_vkCmdSetBlendConstants)(VkCommandBuffer commandBuffer, const float blendConstants[4]);
+typedef void (VKAPI_PTR *PFN_vkCmdSetDepthBounds)(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds);
+typedef void (VKAPI_PTR *PFN_vkCmdSetStencilCompareMask)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask);
+typedef void (VKAPI_PTR *PFN_vkCmdSetStencilWriteMask)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask);
+typedef void (VKAPI_PTR *PFN_vkCmdSetStencilReference)(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference);
+typedef void (VKAPI_PTR *PFN_vkCmdBindDescriptorSets)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout layout, uint32_t firstSet, uint32_t descriptorSetCount, const VkDescriptorSet* pDescriptorSets, uint32_t dynamicOffsetCount, const uint32_t* pDynamicOffsets);
+typedef void (VKAPI_PTR *PFN_vkCmdBindIndexBuffer)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkIndexType indexType);
+typedef void (VKAPI_PTR *PFN_vkCmdBindVertexBuffers)(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount, const VkBuffer* pBuffers, const VkDeviceSize* pOffsets);
+typedef void (VKAPI_PTR *PFN_vkCmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance);
+typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexed)(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance);
+typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride);
+typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride);
+typedef void (VKAPI_PTR *PFN_vkCmdDispatch)(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z);
+typedef void (VKAPI_PTR *PFN_vkCmdDispatchIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset);
+typedef void (VKAPI_PTR *PFN_vkCmdCopyBuffer)(VkCommandBuffer commandBuffer, VkBuffer srcBuffer, VkBuffer dstBuffer, uint32_t regionCount, const VkBufferCopy* pRegions);
+typedef void (VKAPI_PTR *PFN_vkCmdCopyImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageCopy* pRegions);
+typedef void (VKAPI_PTR *PFN_vkCmdBlitImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageBlit* pRegions, VkFilter filter);
+typedef void (VKAPI_PTR *PFN_vkCmdCopyBufferToImage)(VkCommandBuffer commandBuffer, VkBuffer srcBuffer, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkBufferImageCopy* pRegions);
+typedef void (VKAPI_PTR *PFN_vkCmdCopyImageToBuffer)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkBuffer dstBuffer, uint32_t regionCount, const VkBufferImageCopy* pRegions);
+typedef void (VKAPI_PTR *PFN_vkCmdUpdateBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize dataSize, const uint32_t* pData);
+typedef void (VKAPI_PTR *PFN_vkCmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize size, uint32_t data);
+typedef void (VKAPI_PTR *PFN_vkCmdClearColorImage)(VkCommandBuffer commandBuffer, VkImage image, VkImageLayout imageLayout, const VkClearColorValue* pColor, uint32_t rangeCount, const VkImageSubresourceRange* pRanges);
+typedef void (VKAPI_PTR *PFN_vkCmdClearDepthStencilImage)(VkCommandBuffer commandBuffer, VkImage image, VkImageLayout imageLayout, const VkClearDepthStencilValue* pDepthStencil, uint32_t rangeCount, const VkImageSubresourceRange* pRanges);
+typedef void (VKAPI_PTR *PFN_vkCmdClearAttachments)(VkCommandBuffer commandBuffer, uint32_t attachmentCount, const VkClearAttachment* pAttachments, uint32_t rectCount, const VkClearRect* pRects);
+typedef void (VKAPI_PTR *PFN_vkCmdResolveImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageResolve* pRegions);
+typedef void (VKAPI_PTR *PFN_vkCmdSetEvent)(VkCommandBuffer commandBuffer, VkEvent event, VkPipelineStageFlags stageMask);
+typedef void (VKAPI_PTR *PFN_vkCmdResetEvent)(VkCommandBuffer commandBuffer, VkEvent event, VkPipelineStageFlags stageMask);
+typedef void (VKAPI_PTR *PFN_vkCmdWaitEvents)(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent* pEvents, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers, uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier* pImageMemoryBarriers);
+typedef void (VKAPI_PTR *PFN_vkCmdPipelineBarrier)(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags, uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers, uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier* pImageMemoryBarriers);
+typedef void (VKAPI_PTR *PFN_vkCmdBeginQuery)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags);
+typedef void (VKAPI_PTR *PFN_vkCmdEndQuery)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query);
+typedef void (VKAPI_PTR *PFN_vkCmdResetQueryPool)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount);
+typedef void (VKAPI_PTR *PFN_vkCmdWriteTimestamp)(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage, VkQueryPool queryPool, uint32_t query);
+typedef void (VKAPI_PTR *PFN_vkCmdCopyQueryPoolResults)(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags);
+typedef void (VKAPI_PTR *PFN_vkCmdPushConstants)(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, const void* pValues);
+typedef void (VKAPI_PTR *PFN_vkCmdBeginRenderPass)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, VkSubpassContents contents);
+typedef void (VKAPI_PTR *PFN_vkCmdNextSubpass)(VkCommandBuffer commandBuffer, VkSubpassContents contents);
+typedef void (VKAPI_PTR *PFN_vkCmdEndRenderPass)(VkCommandBuffer commandBuffer);
+typedef void (VKAPI_PTR *PFN_vkCmdExecuteCommands)(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer* pCommandBuffers);
+
+/*
+ * Static prototypes for the core entry points.  Defining VK_NO_PROTOTYPES
+ * before including this header compiles them out; applications can then
+ * resolve commands at runtime through vkGetInstanceProcAddr /
+ * vkGetDeviceProcAddr (declared below) using the PFN_vk* types above.
+ */
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance(
+    const VkInstanceCreateInfo*                 pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkInstance*                                 pInstance);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyInstance(
+    VkInstance                                  instance,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices(
+    VkInstance                                  instance,
+    uint32_t*                                   pPhysicalDeviceCount,
+    VkPhysicalDevice*                           pPhysicalDevices);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceFeatures*                   pFeatures);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkFormatProperties*                         pFormatProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceImageFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkImageType                                 type,
+    VkImageTiling                               tiling,
+    VkImageUsageFlags                           usage,
+    VkImageCreateFlags                          flags,
+    VkImageFormatProperties*                    pImageFormatProperties);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceProperties*                 pProperties);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pQueueFamilyPropertyCount,
+    VkQueueFamilyProperties*                    pQueueFamilyProperties);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMemoryProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceMemoryProperties*           pMemoryProperties);
+
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(
+    VkInstance                                  instance,
+    const char*                                 pName);
+
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr(
+    VkDevice                                    device,
+    const char*                                 pName);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice(
+    VkPhysicalDevice                            physicalDevice,
+    const VkDeviceCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDevice*                                   pDevice);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyDevice(
+    VkDevice                                    device,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties(
+    const char*                                 pLayerName,
+    uint32_t*                                   pPropertyCount,
+    VkExtensionProperties*                      pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties(
+    VkPhysicalDevice                            physicalDevice,
+    const char*                                 pLayerName,
+    uint32_t*                                   pPropertyCount,
+    VkExtensionProperties*                      pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceLayerProperties(
+    uint32_t*                                   pPropertyCount,
+    VkLayerProperties*                          pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceLayerProperties(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkLayerProperties*                          pProperties);
+
+VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue(
+    VkDevice                                    device,
+    uint32_t                                    queueFamilyIndex,
+    uint32_t                                    queueIndex,
+    VkQueue*                                    pQueue);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(
+    VkQueue                                     queue,
+    uint32_t                                    submitCount,
+    const VkSubmitInfo*                         pSubmits,
+    VkFence                                     fence);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkQueueWaitIdle(
+    VkQueue                                     queue);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle(
+    VkDevice                                    device);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkAllocateMemory(
+    VkDevice                                    device,
+    const VkMemoryAllocateInfo*                 pAllocateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDeviceMemory*                             pMemory);
+
+VKAPI_ATTR void VKAPI_CALL vkFreeMemory(
+    VkDevice                                    device,
+    VkDeviceMemory                              memory,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkMapMemory(
+    VkDevice                                    device,
+    VkDeviceMemory                              memory,
+    VkDeviceSize                                offset,
+    VkDeviceSize                                size,
+    VkMemoryMapFlags                            flags,
+    void**                                      ppData);
+
+VKAPI_ATTR void VKAPI_CALL vkUnmapMemory(
+    VkDevice                                    device,
+    VkDeviceMemory                              memory);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkFlushMappedMemoryRanges(
+    VkDevice                                    device,
+    uint32_t                                    memoryRangeCount,
+    const VkMappedMemoryRange*                  pMemoryRanges);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkInvalidateMappedMemoryRanges(
+    VkDevice                                    device,
+    uint32_t                                    memoryRangeCount,
+    const VkMappedMemoryRange*                  pMemoryRanges);
+
+VKAPI_ATTR void VKAPI_CALL vkGetDeviceMemoryCommitment(
+    VkDevice                                    device,
+    VkDeviceMemory                              memory,
+    VkDeviceSize*                               pCommittedMemoryInBytes);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory(
+    VkDevice                                    device,
+    VkBuffer                                    buffer,
+    VkDeviceMemory                              memory,
+    VkDeviceSize                                memoryOffset);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkBindImageMemory(
+    VkDevice                                    device,
+    VkImage                                     image,
+    VkDeviceMemory                              memory,
+    VkDeviceSize                                memoryOffset);
+
+VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements(
+    VkDevice                                    device,
+    VkBuffer                                    buffer,
+    VkMemoryRequirements*                       pMemoryRequirements);
+
+VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements(
+    VkDevice                                    device,
+    VkImage                                     image,
+    VkMemoryRequirements*                       pMemoryRequirements);
+
+VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements(
+    VkDevice                                    device,
+    VkImage                                     image,
+    uint32_t*                                   pSparseMemoryRequirementCount,
+    VkSparseImageMemoryRequirements*            pSparseMemoryRequirements);
+
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceSparseImageFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkImageType                                 type,
+    VkSampleCountFlagBits                       samples,
+    VkImageUsageFlags                           usage,
+    VkImageTiling                               tiling,
+    uint32_t*                                   pPropertyCount,
+    VkSparseImageFormatProperties*              pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkQueueBindSparse(
+    VkQueue                                     queue,
+    uint32_t                                    bindInfoCount,
+    const VkBindSparseInfo*                     pBindInfo,
+    VkFence                                     fence);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateFence(
+    VkDevice                                    device,
+    const VkFenceCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFence*                                    pFence);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyFence(
+    VkDevice                                    device,
+    VkFence                                     fence,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkResetFences(
+    VkDevice                                    device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetFenceStatus(
+    VkDevice                                    device,
+    VkFence                                     fence);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkWaitForFences(
+    VkDevice                                    device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences,
+    VkBool32                                    waitAll,
+    uint64_t                                    timeout);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore(
+    VkDevice                                    device,
+    const VkSemaphoreCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSemaphore*                                pSemaphore);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore(
+    VkDevice                                    device,
+    VkSemaphore                                 semaphore,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateEvent(
+    VkDevice                                    device,
+    const VkEventCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkEvent*                                    pEvent);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyEvent(
+    VkDevice                                    device,
+    VkEvent                                     event,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetEventStatus(
+    VkDevice                                    device,
+    VkEvent                                     event);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkSetEvent(
+    VkDevice                                    device,
+    VkEvent                                     event);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkResetEvent(
+    VkDevice                                    device,
+    VkEvent                                     event);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateQueryPool(
+    VkDevice                                    device,
+    const VkQueryPoolCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkQueryPool*                                pQueryPool);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyQueryPool(
+    VkDevice                                    device,
+    VkQueryPool                                 queryPool,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetQueryPoolResults(
+    VkDevice                                    device,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount,
+    size_t                                      dataSize,
+    void*                                       pData,
+    VkDeviceSize                                stride,
+    VkQueryResultFlags                          flags);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateBuffer(
+    VkDevice                                    device,
+    const VkBufferCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkBuffer*                                   pBuffer);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyBuffer(
+    VkDevice                                    device,
+    VkBuffer                                    buffer,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateBufferView(
+    VkDevice                                    device,
+    const VkBufferViewCreateInfo*               pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkBufferView*                               pView);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyBufferView(
+    VkDevice                                    device,
+    VkBufferView                                bufferView,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateImage(
+    VkDevice                                    device,
+    const VkImageCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkImage*                                    pImage);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyImage(
+    VkDevice                                    device,
+    VkImage                                     image,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout(
+    VkDevice                                    device,
+    VkImage                                     image,
+    const VkImageSubresource*                   pSubresource,
+    VkSubresourceLayout*                        pLayout);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateImageView(
+    VkDevice                                    device,
+    const VkImageViewCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkImageView*                                pView);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyImageView(
+    VkDevice                                    device,
+    VkImageView                                 imageView,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateShaderModule(
+    VkDevice                                    device,
+    const VkShaderModuleCreateInfo*             pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkShaderModule*                             pShaderModule);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyShaderModule(
+    VkDevice                                    device,
+    VkShaderModule                              shaderModule,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineCache(
+    VkDevice                                    device,
+    const VkPipelineCacheCreateInfo*            pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipelineCache*                            pPipelineCache);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineCache(
+    VkDevice                                    device,
+    VkPipelineCache                             pipelineCache,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineCacheData(
+    VkDevice                                    device,
+    VkPipelineCache                             pipelineCache,
+    size_t*                                     pDataSize,
+    void*                                       pData);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkMergePipelineCaches(
+    VkDevice                                    device,
+    VkPipelineCache                             dstCache,
+    uint32_t                                    srcCacheCount,
+    const VkPipelineCache*                      pSrcCaches);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines(
+    VkDevice                                    device,
+    VkPipelineCache                             pipelineCache,
+    uint32_t                                    createInfoCount,
+    const VkGraphicsPipelineCreateInfo*         pCreateInfos,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipelines);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines(
+    VkDevice                                    device,
+    VkPipelineCache                             pipelineCache,
+    uint32_t                                    createInfoCount,
+    const VkComputePipelineCreateInfo*          pCreateInfos,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipelines);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline(
+    VkDevice                                    device,
+    VkPipeline                                  pipeline,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreatePipelineLayout(
+    VkDevice                                    device,
+    const VkPipelineLayoutCreateInfo*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipelineLayout*                           pPipelineLayout);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyPipelineLayout(
+    VkDevice                                    device,
+    VkPipelineLayout                            pipelineLayout,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSampler(
+    VkDevice                                    device,
+    const VkSamplerCreateInfo*                  pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSampler*                                  pSampler);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroySampler(
+    VkDevice                                    device,
+    VkSampler                                   sampler,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorSetLayout(
+    VkDevice                                    device,
+    const VkDescriptorSetLayoutCreateInfo*      pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDescriptorSetLayout*                      pSetLayout);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorSetLayout(
+    VkDevice                                    device,
+    VkDescriptorSetLayout                       descriptorSetLayout,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorPool(
+    VkDevice                                    device,
+    const VkDescriptorPoolCreateInfo*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDescriptorPool*                           pDescriptorPool);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorPool(
+    VkDevice                                    device,
+    VkDescriptorPool                            descriptorPool,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkResetDescriptorPool(
+    VkDevice                                    device,
+    VkDescriptorPool                            descriptorPool,
+    VkDescriptorPoolResetFlags                  flags);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkAllocateDescriptorSets(
+    VkDevice                                    device,
+    const VkDescriptorSetAllocateInfo*          pAllocateInfo,
+    VkDescriptorSet*                            pDescriptorSets);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkFreeDescriptorSets(
+    VkDevice                                    device,
+    VkDescriptorPool                            descriptorPool,
+    uint32_t                                    descriptorSetCount,
+    const VkDescriptorSet*                      pDescriptorSets);
+
+VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSets(
+    VkDevice                                    device,
+    uint32_t                                    descriptorWriteCount,
+    const VkWriteDescriptorSet*                 pDescriptorWrites,
+    uint32_t                                    descriptorCopyCount,
+    const VkCopyDescriptorSet*                  pDescriptorCopies);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateFramebuffer(
+    VkDevice                                    device,
+    const VkFramebufferCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFramebuffer*                              pFramebuffer);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyFramebuffer(
+    VkDevice                                    device,
+    VkFramebuffer                               framebuffer,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateRenderPass(
+    VkDevice                                    device,
+    const VkRenderPassCreateInfo*               pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkRenderPass*                               pRenderPass);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyRenderPass(
+    VkDevice                                    device,
+    VkRenderPass                                renderPass,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR void VKAPI_CALL vkGetRenderAreaGranularity(
+    VkDevice                                    device,
+    VkRenderPass                                renderPass,
+    VkExtent2D*                                 pGranularity);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool(
+    VkDevice                                    device,
+    const VkCommandPoolCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkCommandPool*                              pCommandPool);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkResetCommandPool(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    VkCommandPoolResetFlags                     flags);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers(
+    VkDevice                                    device,
+    const VkCommandBufferAllocateInfo*          pAllocateInfo,
+    VkCommandBuffer*                            pCommandBuffers);
+
+VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCommandBuffers);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer(
+    VkCommandBuffer                             commandBuffer,
+    const VkCommandBufferBeginInfo*             pBeginInfo);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer(
+    VkCommandBuffer                             commandBuffer);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkResetCommandBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkCommandBufferResetFlags                   flags);
+
+/* Command buffer recording commands: every vkCmd* entry point below takes
+ * the target VkCommandBuffer as its first parameter and returns void. */
+VKAPI_ATTR void VKAPI_CALL vkCmdBindPipeline(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineBindPoint                         pipelineBindPoint,
+    VkPipeline                                  pipeline);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetViewport(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstViewport,
+    uint32_t                                    viewportCount,
+    const VkViewport*                           pViewports);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetScissor(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstScissor,
+    uint32_t                                    scissorCount,
+    const VkRect2D*                             pScissors);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetLineWidth(
+    VkCommandBuffer                             commandBuffer,
+    float                                       lineWidth);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetDepthBias(
+    VkCommandBuffer                             commandBuffer,
+    float                                       depthBiasConstantFactor,
+    float                                       depthBiasClamp,
+    float                                       depthBiasSlopeFactor);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetBlendConstants(
+    VkCommandBuffer                             commandBuffer,
+    const float                                 blendConstants[4]);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetDepthBounds(
+    VkCommandBuffer                             commandBuffer,
+    float                                       minDepthBounds,
+    float                                       maxDepthBounds);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilCompareMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    compareMask);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilWriteMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    writeMask);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetStencilReference(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    reference);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBindDescriptorSets(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineBindPoint                         pipelineBindPoint,
+    VkPipelineLayout                            layout,
+    uint32_t                                    firstSet,
+    uint32_t                                    descriptorSetCount,
+    const VkDescriptorSet*                      pDescriptorSets,
+    uint32_t                                    dynamicOffsetCount,
+    const uint32_t*                             pDynamicOffsets);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset,
+    VkIndexType                                 indexType);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBindVertexBuffers(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstBinding,
+    uint32_t                                    bindingCount,
+    const VkBuffer*                             pBuffers,
+    const VkDeviceSize*                         pOffsets);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDraw(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    vertexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstVertex,
+    uint32_t                                    firstInstance);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexed(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    indexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstIndex,
+    int32_t                                     vertexOffset,
+    uint32_t                                    firstInstance);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirect(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirect(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDispatch(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    x,
+    uint32_t                                    y,
+    uint32_t                                    z);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdCopyBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    srcBuffer,
+    VkBuffer                                    dstBuffer,
+    uint32_t                                    regionCount,
+    const VkBufferCopy*                         pRegions);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdCopyImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     dstImage,
+    VkImageLayout                               dstImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageCopy*                          pRegions);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBlitImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     dstImage,
+    VkImageLayout                               dstImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageBlit*                          pRegions,
+    VkFilter                                    filter);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdCopyBufferToImage(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    srcBuffer,
+    VkImage                                     dstImage,
+    VkImageLayout                               dstImageLayout,
+    uint32_t                                    regionCount,
+    const VkBufferImageCopy*                    pRegions);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdCopyImageToBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkBuffer                                    dstBuffer,
+    uint32_t                                    regionCount,
+    const VkBufferImageCopy*                    pRegions);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdUpdateBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    dstBuffer,
+    VkDeviceSize                                dstOffset,
+    VkDeviceSize                                dataSize,
+    const uint32_t*                             pData);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdFillBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    dstBuffer,
+    VkDeviceSize                                dstOffset,
+    VkDeviceSize                                size,
+    uint32_t                                    data);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     image,
+    VkImageLayout                               imageLayout,
+    const VkClearColorValue*                    pColor,
+    uint32_t                                    rangeCount,
+    const VkImageSubresourceRange*              pRanges);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdClearDepthStencilImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     image,
+    VkImageLayout                               imageLayout,
+    const VkClearDepthStencilValue*             pDepthStencil,
+    uint32_t                                    rangeCount,
+    const VkImageSubresourceRange*              pRanges);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdClearAttachments(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    attachmentCount,
+    const VkClearAttachment*                    pAttachments,
+    uint32_t                                    rectCount,
+    const VkClearRect*                          pRects);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdResolveImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     dstImage,
+    VkImageLayout                               dstImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageResolve*                       pRegions);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdSetEvent(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     event,
+    VkPipelineStageFlags                        stageMask);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdResetEvent(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     event,
+    VkPipelineStageFlags                        stageMask);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    eventCount,
+    const VkEvent*                              pEvents,
+    VkPipelineStageFlags                        srcStageMask,
+    VkPipelineStageFlags                        dstStageMask,
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineStageFlags                        srcStageMask,
+    VkPipelineStageFlags                        dstStageMask,
+    VkDependencyFlags                           dependencyFlags,
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBeginQuery(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    query,
+    VkQueryControlFlags                         flags);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdEndQuery(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    query);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdResetQueryPool(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdWriteTimestamp(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineStageFlagBits                     pipelineStage,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    query);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdCopyQueryPoolResults(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount,
+    VkBuffer                                    dstBuffer,
+    VkDeviceSize                                dstOffset,
+    VkDeviceSize                                stride,
+    VkQueryResultFlags                          flags);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdPushConstants(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineLayout                            layout,
+    VkShaderStageFlags                          stageFlags,
+    uint32_t                                    offset,
+    uint32_t                                    size,
+    const void*                                 pValues);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBeginRenderPass(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBegin,
+    VkSubpassContents                           contents);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdNextSubpass(
+    VkCommandBuffer                             commandBuffer,
+    VkSubpassContents                           contents);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdEndRenderPass(
+    VkCommandBuffer                             commandBuffer);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdExecuteCommands(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCommandBuffers);
+#endif
+
+#define VK_KHR_surface 1
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSurfaceKHR)
+
+#define VK_KHR_SURFACE_SPEC_VERSION       24
+#define VK_KHR_SURFACE_EXTENSION_NAME     "VK_KHR_surface"
+
+
+typedef enum VkColorSpaceKHR {
+    VK_COLORSPACE_SRGB_NONLINEAR_KHR = 0,
+    VK_COLORSPACE_BEGIN_RANGE = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
+    VK_COLORSPACE_END_RANGE = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
+    VK_COLORSPACE_RANGE_SIZE = (VK_COLORSPACE_SRGB_NONLINEAR_KHR - VK_COLORSPACE_SRGB_NONLINEAR_KHR + 1),
+    VK_COLORSPACE_MAX_ENUM = 0x7FFFFFFF
+} VkColorSpaceKHR;
+
+typedef enum VkPresentModeKHR {
+    VK_PRESENT_MODE_IMMEDIATE_KHR = 0,
+    VK_PRESENT_MODE_MAILBOX_KHR = 1,
+    VK_PRESENT_MODE_FIFO_KHR = 2,
+    VK_PRESENT_MODE_FIFO_RELAXED_KHR = 3,
+    VK_PRESENT_MODE_BEGIN_RANGE = VK_PRESENT_MODE_IMMEDIATE_KHR,
+    VK_PRESENT_MODE_END_RANGE = VK_PRESENT_MODE_FIFO_RELAXED_KHR,
+    VK_PRESENT_MODE_RANGE_SIZE = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1),
+    VK_PRESENT_MODE_MAX_ENUM = 0x7FFFFFFF
+} VkPresentModeKHR;
+
+
+typedef enum VkSurfaceTransformFlagBitsKHR {
+    VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR = 0x00000001,
+    VK_SURFACE_TRANSFORM_ROTATE_90_BIT_KHR = 0x00000002,
+    VK_SURFACE_TRANSFORM_ROTATE_180_BIT_KHR = 0x00000004,
+    VK_SURFACE_TRANSFORM_ROTATE_270_BIT_KHR = 0x00000008,
+    VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_BIT_KHR = 0x00000010,
+    VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_90_BIT_KHR = 0x00000020,
+    VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_180_BIT_KHR = 0x00000040,
+    VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_270_BIT_KHR = 0x00000080,
+    VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR = 0x00000100,
+} VkSurfaceTransformFlagBitsKHR;
+typedef VkFlags VkSurfaceTransformFlagsKHR;
+
+typedef enum VkCompositeAlphaFlagBitsKHR {
+    VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR = 0x00000001,
+    VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR = 0x00000002,
+    VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR = 0x00000004,
+    VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR = 0x00000008,
+} VkCompositeAlphaFlagBitsKHR;
+typedef VkFlags VkCompositeAlphaFlagsKHR;
+
+typedef struct VkSurfaceCapabilitiesKHR {
+    uint32_t                         minImageCount;
+    uint32_t                         maxImageCount;
+    VkExtent2D                       currentExtent;
+    VkExtent2D                       minImageExtent;
+    VkExtent2D                       maxImageExtent;
+    uint32_t                         maxImageArrayLayers;
+    VkSurfaceTransformFlagsKHR       supportedTransforms;
+    VkSurfaceTransformFlagBitsKHR    currentTransform;
+    VkCompositeAlphaFlagsKHR         supportedCompositeAlpha;
+    VkImageUsageFlags                supportedUsageFlags;
+} VkSurfaceCapabilitiesKHR;
+
+typedef struct VkSurfaceFormatKHR {
+    VkFormat           format;
+    VkColorSpaceKHR    colorSpace;
+} VkSurfaceFormatKHR;
+
+
+typedef void (VKAPI_PTR *PFN_vkDestroySurfaceKHR)(VkInstance instance, VkSurfaceKHR surface, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, VkSurfaceKHR surface, VkBool32* pSupported);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pSurfaceFormatCount, VkSurfaceFormatKHR* pSurfaceFormats);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pPresentModeCount, VkPresentModeKHR* pPresentModes);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR(
+    VkInstance                                  instance,
+    VkSurfaceKHR                                surface,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    VkSurfaceKHR                                surface,
+    VkBool32*                                   pSupported);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                surface,
+    VkSurfaceCapabilitiesKHR*                   pSurfaceCapabilities);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                surface,
+    uint32_t*                                   pSurfaceFormatCount,
+    VkSurfaceFormatKHR*                         pSurfaceFormats);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                surface,
+    uint32_t*                                   pPresentModeCount,
+    VkPresentModeKHR*                           pPresentModes);
+#endif
+
+#define VK_KHR_swapchain 1
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR)
+
+#define VK_KHR_SWAPCHAIN_SPEC_VERSION     67
+#define VK_KHR_SWAPCHAIN_EXTENSION_NAME   "VK_KHR_swapchain"
+
+typedef VkFlags VkSwapchainCreateFlagsKHR;
+
+typedef struct VkSwapchainCreateInfoKHR {
+    VkStructureType                  sType;
+    const void*                      pNext;
+    VkSwapchainCreateFlagsKHR        flags;
+    VkSurfaceKHR                     surface;
+    uint32_t                         minImageCount;
+    VkFormat                         imageFormat;
+    VkColorSpaceKHR                  imageColorSpace;
+    VkExtent2D                       imageExtent;
+    uint32_t                         imageArrayLayers;
+    VkImageUsageFlags                imageUsage;
+    VkSharingMode                    imageSharingMode;
+    uint32_t                         queueFamilyIndexCount;
+    const uint32_t*                  pQueueFamilyIndices;
+    VkSurfaceTransformFlagBitsKHR    preTransform;
+    VkCompositeAlphaFlagBitsKHR      compositeAlpha;
+    VkPresentModeKHR                 presentMode;
+    VkBool32                         clipped;
+    VkSwapchainKHR                   oldSwapchain;
+} VkSwapchainCreateInfoKHR;
+
+typedef struct VkPresentInfoKHR {
+    VkStructureType          sType;
+    const void*              pNext;
+    uint32_t                 waitSemaphoreCount;
+    const VkSemaphore*       pWaitSemaphores;
+    uint32_t                 swapchainCount;
+    const VkSwapchainKHR*    pSwapchains;
+    const uint32_t*          pImageIndices;
+    VkResult*                pResults;
+} VkPresentInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateSwapchainKHR)(VkDevice device, const VkSwapchainCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapchain);
+typedef void (VKAPI_PTR *PFN_vkDestroySwapchainKHR)(VkDevice device, VkSwapchainKHR swapchain, const VkAllocationCallbacks* pAllocator);
+typedef VkResult (VKAPI_PTR *PFN_vkGetSwapchainImagesKHR)(VkDevice device, VkSwapchainKHR swapchain, uint32_t* pSwapchainImageCount, VkImage* pSwapchainImages);
+typedef VkResult (VKAPI_PTR *PFN_vkAcquireNextImageKHR)(VkDevice device, VkSwapchainKHR swapchain, uint64_t timeout, VkSemaphore semaphore, VkFence fence, uint32_t* pImageIndex);
+typedef VkResult (VKAPI_PTR *PFN_vkQueuePresentKHR)(VkQueue queue, const VkPresentInfoKHR* pPresentInfo);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR(
+    VkDevice                                    device,
+    const VkSwapchainCreateInfoKHR*             pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSwapchainKHR*                             pSwapchain);
+
+VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR(
+    VkDevice                                    device,
+    VkSwapchainKHR                              swapchain,
+    const VkAllocationCallbacks*                pAllocator);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR(
+    VkDevice                                    device,
+    VkSwapchainKHR                              swapchain,
+    uint32_t*                                   pSwapchainImageCount,
+    VkImage*                                    pSwapchainImages);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR(
+    VkDevice                                    device,
+    VkSwapchainKHR                              swapchain,
+    uint64_t                                    timeout,
+    VkSemaphore                                 semaphore,
+    VkFence                                     fence,
+    uint32_t*                                   pImageIndex);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR(
+    VkQueue                                     queue,
+    const VkPresentInfoKHR*                     pPresentInfo);
+#endif
+
+#define VK_KHR_display 1
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayKHR)
+VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayModeKHR)
+
+#define VK_KHR_DISPLAY_SPEC_VERSION       21
+#define VK_KHR_DISPLAY_EXTENSION_NAME     "VK_KHR_display"
+
+
+typedef enum VkDisplayPlaneAlphaFlagBitsKHR {
+    VK_DISPLAY_PLANE_ALPHA_OPAQUE_BIT_KHR = 0x00000001,
+    VK_DISPLAY_PLANE_ALPHA_GLOBAL_BIT_KHR = 0x00000002,
+    VK_DISPLAY_PLANE_ALPHA_PER_PIXEL_BIT_KHR = 0x00000004,
+    VK_DISPLAY_PLANE_ALPHA_PER_PIXEL_PREMULTIPLIED_BIT_KHR = 0x00000008,
+} VkDisplayPlaneAlphaFlagBitsKHR;
+typedef VkFlags VkDisplayModeCreateFlagsKHR;
+typedef VkFlags VkDisplayPlaneAlphaFlagsKHR;
+typedef VkFlags VkDisplaySurfaceCreateFlagsKHR;
+
+typedef struct VkDisplayPropertiesKHR {
+    VkDisplayKHR                  display;
+    const char*                   displayName;
+    VkExtent2D                    physicalDimensions;
+    VkExtent2D                    physicalResolution;
+    VkSurfaceTransformFlagsKHR    supportedTransforms;
+    VkBool32                      planeReorderPossible;
+    VkBool32                      persistentContent;
+} VkDisplayPropertiesKHR;
+
+typedef struct VkDisplayModeParametersKHR {
+    VkExtent2D    visibleRegion;
+    uint32_t      refreshRate;
+} VkDisplayModeParametersKHR;
+
+typedef struct VkDisplayModePropertiesKHR {
+    VkDisplayModeKHR              displayMode;
+    VkDisplayModeParametersKHR    parameters;
+} VkDisplayModePropertiesKHR;
+
+typedef struct VkDisplayModeCreateInfoKHR {
+    VkStructureType                sType;
+    const void*                    pNext;
+    VkDisplayModeCreateFlagsKHR    flags;
+    VkDisplayModeParametersKHR     parameters;
+} VkDisplayModeCreateInfoKHR;
+
+typedef struct VkDisplayPlaneCapabilitiesKHR {
+    VkDisplayPlaneAlphaFlagsKHR    supportedAlpha;
+    VkOffset2D                     minSrcPosition;
+    VkOffset2D                     maxSrcPosition;
+    VkExtent2D                     minSrcExtent;
+    VkExtent2D                     maxSrcExtent;
+    VkOffset2D                     minDstPosition;
+    VkOffset2D                     maxDstPosition;
+    VkExtent2D                     minDstExtent;
+    VkExtent2D                     maxDstExtent;
+} VkDisplayPlaneCapabilitiesKHR;
+
+typedef struct VkDisplayPlanePropertiesKHR {
+    VkDisplayKHR    currentDisplay;
+    uint32_t        currentStackIndex;
+} VkDisplayPlanePropertiesKHR;
+
+typedef struct VkDisplaySurfaceCreateInfoKHR {
+    VkStructureType                   sType;
+    const void*                       pNext;
+    VkDisplaySurfaceCreateFlagsKHR    flags;
+    VkDisplayModeKHR                  displayMode;
+    uint32_t                          planeIndex;
+    uint32_t                          planeStackIndex;
+    VkSurfaceTransformFlagBitsKHR     transform;
+    float                             globalAlpha;
+    VkDisplayPlaneAlphaFlagBitsKHR    alphaMode;
+    VkExtent2D                        imageExtent;
+} VkDisplaySurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkDisplayPropertiesKHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayPlanePropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkDisplayPlanePropertiesKHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetDisplayPlaneSupportedDisplaysKHR)(VkPhysicalDevice physicalDevice, uint32_t planeIndex, uint32_t* pDisplayCount, VkDisplayKHR* pDisplays);
+typedef VkResult (VKAPI_PTR *PFN_vkGetDisplayModePropertiesKHR)(VkPhysicalDevice physicalDevice, VkDisplayKHR display, uint32_t* pPropertyCount, VkDisplayModePropertiesKHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDisplayModeKHR)(VkPhysicalDevice physicalDevice, VkDisplayKHR display, const VkDisplayModeCreateInfoKHR*pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDisplayModeKHR* pMode);
+typedef VkResult (VKAPI_PTR *PFN_vkGetDisplayPlaneCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkDisplayModeKHR mode, uint32_t planeIndex, VkDisplayPlaneCapabilitiesKHR* pCapabilities);
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDisplayPlaneSurfaceKHR)(VkInstance instance, const VkDisplaySurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayPropertiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayPropertiesKHR*                     pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayPlanePropertiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayPlanePropertiesKHR*                pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayPlaneSupportedDisplaysKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    planeIndex,
+    uint32_t*                                   pDisplayCount,
+    VkDisplayKHR*                               pDisplays);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayModePropertiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayKHR                                display,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayModePropertiesKHR*                 pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDisplayModeKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayKHR                                display,
+    const VkDisplayModeCreateInfoKHR*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDisplayModeKHR*                           pMode);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayPlaneCapabilitiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayModeKHR                            mode,
+    uint32_t                                    planeIndex,
+    VkDisplayPlaneCapabilitiesKHR*              pCapabilities);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDisplayPlaneSurfaceKHR(
+    VkInstance                                  instance,
+    const VkDisplaySurfaceCreateInfoKHR*        pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+#endif
+
+#define VK_KHR_display_swapchain 1
+#define VK_KHR_DISPLAY_SWAPCHAIN_SPEC_VERSION 9
+#define VK_KHR_DISPLAY_SWAPCHAIN_EXTENSION_NAME "VK_KHR_display_swapchain"
+
+typedef struct VkDisplayPresentInfoKHR {
+    VkStructureType    sType;
+    const void*        pNext;
+    VkRect2D           srcRect;
+    VkRect2D           dstRect;
+    VkBool32           persistent;
+} VkDisplayPresentInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateSharedSwapchainsKHR)(VkDevice device, uint32_t swapchainCount, const VkSwapchainCreateInfoKHR* pCreateInfos, const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapchains);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSharedSwapchainsKHR(
+    VkDevice                                    device,
+    uint32_t                                    swapchainCount,
+    const VkSwapchainCreateInfoKHR*             pCreateInfos,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSwapchainKHR*                             pSwapchains);
+#endif
+
+#ifdef VK_USE_PLATFORM_XLIB_KHR
+#define VK_KHR_xlib_surface 1
+#include <X11/Xlib.h>
+
+#define VK_KHR_XLIB_SURFACE_SPEC_VERSION  6
+#define VK_KHR_XLIB_SURFACE_EXTENSION_NAME "VK_KHR_xlib_surface"
+
+typedef VkFlags VkXlibSurfaceCreateFlagsKHR;
+
+typedef struct VkXlibSurfaceCreateInfoKHR {
+    VkStructureType                sType;
+    const void*                    pNext;
+    VkXlibSurfaceCreateFlagsKHR    flags;
+    Display*                       dpy;
+    Window                         window;
+} VkXlibSurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateXlibSurfaceKHR)(VkInstance instance, const VkXlibSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXlibPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, Display* dpy, VisualID visualID);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateXlibSurfaceKHR(
+    VkInstance                                  instance,
+    const VkXlibSurfaceCreateInfoKHR*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+
+VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXlibPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    Display*                                    dpy,
+    VisualID                                    visualID);
+#endif
+#endif /* VK_USE_PLATFORM_XLIB_KHR */
+
+#ifdef VK_USE_PLATFORM_XCB_KHR
+#define VK_KHR_xcb_surface 1
+#include <xcb/xcb.h>
+
+#define VK_KHR_XCB_SURFACE_SPEC_VERSION   6
+#define VK_KHR_XCB_SURFACE_EXTENSION_NAME "VK_KHR_xcb_surface"
+
+typedef VkFlags VkXcbSurfaceCreateFlagsKHR;
+
+typedef struct VkXcbSurfaceCreateInfoKHR {
+    VkStructureType               sType;
+    const void*                   pNext;
+    VkXcbSurfaceCreateFlagsKHR    flags;
+    xcb_connection_t*             connection;
+    xcb_window_t                  window;
+} VkXcbSurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateXcbSurfaceKHR)(VkInstance instance, const VkXcbSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceXcbPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, xcb_connection_t* connection, xcb_visualid_t visual_id);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateXcbSurfaceKHR(
+    VkInstance                                  instance,
+    const VkXcbSurfaceCreateInfoKHR*            pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+
+VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXcbPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    xcb_connection_t*                           connection,
+    xcb_visualid_t                              visual_id);
+#endif
+#endif /* VK_USE_PLATFORM_XCB_KHR */
+
+#ifdef VK_USE_PLATFORM_WAYLAND_KHR
+#define VK_KHR_wayland_surface 1
+#include <wayland-client.h>
+
+#define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 5
+#define VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME "VK_KHR_wayland_surface"
+
+typedef VkFlags VkWaylandSurfaceCreateFlagsKHR;
+
+typedef struct VkWaylandSurfaceCreateInfoKHR {
+    VkStructureType                   sType;
+    const void*                       pNext;
+    VkWaylandSurfaceCreateFlagsKHR    flags;
+    struct wl_display*                display;
+    struct wl_surface*                surface;
+} VkWaylandSurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateWaylandSurfaceKHR)(VkInstance instance, const VkWaylandSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceWaylandPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, struct wl_display* display);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateWaylandSurfaceKHR(
+    VkInstance                                  instance,
+    const VkWaylandSurfaceCreateInfoKHR*        pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+
+VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceWaylandPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    struct wl_display*                          display);
+#endif
+#endif /* VK_USE_PLATFORM_WAYLAND_KHR */
+
+#ifdef VK_USE_PLATFORM_MIR_KHR
+#define VK_KHR_mir_surface 1
+#include <mir_toolkit/client_types.h>
+
+#define VK_KHR_MIR_SURFACE_SPEC_VERSION   4
+#define VK_KHR_MIR_SURFACE_EXTENSION_NAME "VK_KHR_mir_surface"
+
+typedef VkFlags VkMirSurfaceCreateFlagsKHR;
+
+typedef struct VkMirSurfaceCreateInfoKHR {
+    VkStructureType               sType;
+    const void*                   pNext;
+    VkMirSurfaceCreateFlagsKHR    flags;
+    MirConnection*                connection;
+    MirSurface*                   mirSurface;
+} VkMirSurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateMirSurfaceKHR)(VkInstance instance, const VkMirSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceMirPresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, MirConnection* connection);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateMirSurfaceKHR(
+    VkInstance                                  instance,
+    const VkMirSurfaceCreateInfoKHR*            pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+
+VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceMirPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    MirConnection*                              connection);
+#endif
+#endif /* VK_USE_PLATFORM_MIR_KHR */
+
+#ifdef VK_USE_PLATFORM_ANDROID_KHR
+#define VK_KHR_android_surface 1
+#include <android/native_window.h>
+
+#define VK_KHR_ANDROID_SURFACE_SPEC_VERSION 5
+#define VK_KHR_ANDROID_SURFACE_EXTENSION_NAME "VK_KHR_android_surface"
+
+typedef VkFlags VkAndroidSurfaceCreateFlagsKHR;
+
+typedef struct VkAndroidSurfaceCreateInfoKHR {
+    VkStructureType                   sType;
+    const void*                       pNext;
+    VkAndroidSurfaceCreateFlagsKHR    flags;
+    ANativeWindow*                    window;
+} VkAndroidSurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateAndroidSurfaceKHR)(VkInstance instance, const VkAndroidSurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateAndroidSurfaceKHR(
+    VkInstance                                  instance,
+    const VkAndroidSurfaceCreateInfoKHR*        pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+#endif
+#endif /* VK_USE_PLATFORM_ANDROID_KHR */
+
+#ifdef VK_USE_PLATFORM_WIN32_KHR
+#define VK_KHR_win32_surface 1
+#include <windows.h>
+
+#define VK_KHR_WIN32_SURFACE_SPEC_VERSION 5
+#define VK_KHR_WIN32_SURFACE_EXTENSION_NAME "VK_KHR_win32_surface"
+
+typedef VkFlags VkWin32SurfaceCreateFlagsKHR;
+
+typedef struct VkWin32SurfaceCreateInfoKHR {
+    VkStructureType                 sType;
+    const void*                     pNext;
+    VkWin32SurfaceCreateFlagsKHR    flags;
+    HINSTANCE                       hinstance;
+    HWND                            hwnd;
+} VkWin32SurfaceCreateInfoKHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateWin32SurfaceKHR)(VkInstance instance, const VkWin32SurfaceCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface);
+typedef VkBool32 (VKAPI_PTR *PFN_vkGetPhysicalDeviceWin32PresentationSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateWin32SurfaceKHR(
+    VkInstance                                  instance,
+    const VkWin32SurfaceCreateInfoKHR*          pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface);
+
+VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceWin32PresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex);
+#endif
+#endif /* VK_USE_PLATFORM_WIN32_KHR */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/vulkan/vulkan_intel.h b/include/vulkan/vulkan_intel.h
new file mode 100644 (file)
index 0000000..1f77128
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __VULKAN_INTEL_H__
+#define __VULKAN_INTEL_H__
+
+#include "vulkan.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+#define VK_STRUCTURE_TYPE_DMA_BUF_IMAGE_CREATE_INFO_INTEL 1024
+typedef struct VkDmaBufImageCreateInfo_
+{
+    VkStructureType                             sType;                      // Must be VK_STRUCTURE_TYPE_DMA_BUF_IMAGE_CREATE_INFO_INTEL
+    const void*                                 pNext;                      // Pointer to next structure.
+    int                                         fd;
+    VkFormat                                    format;
+    VkExtent3D                                  extent;         // Depth must be 1
+    uint32_t                                    strideInBytes;
+} VkDmaBufImageCreateInfo;
+
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDmaBufImageINTEL)(VkDevice device, const VkDmaBufImageCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDeviceMemory* pMem, VkImage* pImage);
+
+#ifdef VK_PROTOTYPES
+
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDmaBufImageINTEL(
+    VkDevice                                    _device,
+    const VkDmaBufImageCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDeviceMemory*                             pMem,
+    VkImage*                                    pImage);
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // __VULKAN_INTEL_H__
index 46dbf0e..6dcd952 100755 (executable)
@@ -300,7 +300,7 @@ def generate(env):
 
     # C preprocessor options
     cppdefines = []
-    cppdefines += ['__STDC_LIMIT_MACROS']
+    cppdefines += ['__STDC_LIMIT_MACROS', '__STDC_CONSTANT_MACROS']
     if env['build'] in ('debug', 'checked'):
         cppdefines += ['DEBUG']
     else:
index 0d49bcd..9f51e44 100644 (file)
@@ -53,6 +53,11 @@ EXTRA_DIST = \
 AM_CFLAGS = $(VISIBILITY_CFLAGS)
 AM_CXXFLAGS = $(VISIBILITY_CXXFLAGS)
 
+if HAVE_VULKAN
+SUBDIRS += isl
+SUBDIRS += vulkan
+endif
+
 AM_CPPFLAGS = \
        -I$(top_srcdir)/include/ \
        -I$(top_srcdir)/src/mapi/ \
index 7b026b5..bcdf297 100644 (file)
@@ -1,11 +1,10 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
-noinst_LTLIBRARIES = libgallium.la
+noinst_LTLIBRARIES = libgallium_nir.la
 
 AM_CFLAGS = \
        -I$(top_srcdir)/src/loader \
-       -I$(top_builddir)/src/glsl/nir \
        -I$(top_srcdir)/src/gallium/auxiliary/util \
        $(GALLIUM_CFLAGS) \
        $(VISIBILITY_CFLAGS) \
@@ -15,11 +14,24 @@ AM_CXXFLAGS = \
        $(VISIBILITY_CXXFLAGS) \
        $(MSVC2008_COMPAT_CXXFLAGS)
 
+libgallium_nir_la_SOURCES = \
+       $(NIR_SOURCES)
+
+libgallium_nir_la_CFLAGS = \
+       -I$(top_builddir)/src/glsl/nir \
+       $(GALLIUM_CFLAGS) \
+       $(VISIBILITY_CFLAGS) \
+       $(MSVC2013_COMPAT_CFLAGS)
+
+noinst_LTLIBRARIES += libgallium.la
+
 libgallium_la_SOURCES = \
        $(C_SOURCES) \
-       $(NIR_SOURCES) \
        $(GENERATED_SOURCES)
 
+libgallium_la_LIBADD = \
+       libgallium_nir.la
+
 if HAVE_MESA_LLVM
 
 AM_CFLAGS += \
index d92da3d..5325f97 100644 (file)
@@ -252,7 +252,6 @@ C_SOURCES := \
        util/u_helpers.h \
        util/u_index_modify.c \
        util/u_index_modify.h \
-       util/u_init.h \
        util/u_inlines.h \
        util/u_keymap.c \
        util/u_keymap.h \
index 779b237..34add82 100644 (file)
@@ -91,34 +91,34 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
          }
 
          for (i = 0; i < 4; i++) {
-            out->clip[i] = clipvertex[i];
-            out->pre_clip_pos[i] = position[i];
+            out->clip_pos[i] = position[i];
          }
 
+         /* Be careful with NaNs. Comparisons must be true for them. */
          /* Do the hardwired planes first:
           */
          if (flags & DO_CLIP_XY_GUARD_BAND) {
-            if (-0.50 * position[0] + position[3] < 0) mask |= (1<<0);
-            if ( 0.50 * position[0] + position[3] < 0) mask |= (1<<1);
-            if (-0.50 * position[1] + position[3] < 0) mask |= (1<<2);
-            if ( 0.50 * position[1] + position[3] < 0) mask |= (1<<3);
+            if (!(-0.50 * position[0] + position[3] >= 0)) mask |= (1<<0);
+            if (!( 0.50 * position[0] + position[3] >= 0)) mask |= (1<<1);
+            if (!(-0.50 * position[1] + position[3] >= 0)) mask |= (1<<2);
+            if (!( 0.50 * position[1] + position[3] >= 0)) mask |= (1<<3);
          }
          else if (flags & DO_CLIP_XY) {
-            if (-position[0] + position[3] < 0) mask |= (1<<0);
-            if ( position[0] + position[3] < 0) mask |= (1<<1);
-            if (-position[1] + position[3] < 0) mask |= (1<<2);
-            if ( position[1] + position[3] < 0) mask |= (1<<3);
+            if (!(-position[0] + position[3] >= 0)) mask |= (1<<0);
+            if (!( position[0] + position[3] >= 0)) mask |= (1<<1);
+            if (!(-position[1] + position[3] >= 0)) mask |= (1<<2);
+            if (!( position[1] + position[3] >= 0)) mask |= (1<<3);
          }
 
          /* Clip Z planes according to full cube, half cube or none.
           */
          if (flags & DO_CLIP_FULL_Z) {
-            if ( position[2] + position[3] < 0) mask |= (1<<4);
-            if (-position[2] + position[3] < 0) mask |= (1<<5);
+            if (!( position[2] + position[3] >= 0)) mask |= (1<<4);
+            if (!(-position[2] + position[3] >= 0)) mask |= (1<<5);
          }
          else if (flags & DO_CLIP_HALF_Z) {
-            if ( position[2]               < 0) mask |= (1<<4);
-            if (-position[2] + position[3] < 0) mask |= (1<<5);
+            if (!( position[2]               >= 0)) mask |= (1<<4);
+            if (!(-position[2] + position[3] >= 0)) mask |= (1<<5);
          }
 
          if (flags & DO_CLIP_USER) {
@@ -137,7 +137,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
                if (have_cd && num_written_clipdistance) {
                   float clipdist;
                   i = plane_idx - 6;
-                  out->have_clipdist = 1;
                   /* first four clip distance in first vector etc. */
                   if (i < 4)
                      clipdist = out->data[cd[0]][i];
@@ -146,7 +145,7 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
                   if (clipdist < 0 || util_is_inf_or_nan(clipdist))
                      mask |= 1 << plane_idx;
                } else {
-                  if (dot4(clipvertex, plane[plane_idx]) < 0)
+                  if (!(dot4(clipvertex, plane[plane_idx]) >= 0))
                      mask |= 1 << plane_idx;
                }
             }
@@ -192,7 +191,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
 
       out = (struct vertex_header *)( (char *)out + info->stride );
    }
-
    return need_pipeline != 0;
 }
 
index ee97424..142d78a 100644 (file)
@@ -188,6 +188,7 @@ create_jit_sampler_type(struct gallivm_state *gallivm, const char *struct_name)
    sampler_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_sampler, min_lod,
                           target, sampler_type,
                           DRAW_JIT_SAMPLER_MIN_LOD);
@@ -234,6 +235,8 @@ create_jit_context_type(struct gallivm_state *gallivm,
                                  PIPE_MAX_SAMPLERS); /* samplers */
    context_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
+
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, vs_constants,
                           target, context_type, DRAW_JIT_CTX_CONSTANTS);
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, num_vs_constants,
@@ -375,15 +378,14 @@ static LLVMTypeRef
 create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
 {
    LLVMTargetDataRef target = gallivm->target;
-   LLVMTypeRef elem_types[4];
+   LLVMTypeRef elem_types[3];
    LLVMTypeRef vertex_header;
    char struct_name[24];
 
    util_snprintf(struct_name, 23, "vertex_header%d", data_elems);
 
    elem_types[DRAW_JIT_VERTEX_VERTEX_ID]  = LLVMIntTypeInContext(gallivm->context, 32);
-   elem_types[DRAW_JIT_VERTEX_CLIP]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
-   elem_types[DRAW_JIT_VERTEX_PRE_CLIP_POS]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
+   elem_types[DRAW_JIT_VERTEX_CLIP_POS]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
    elem_types[DRAW_JIT_VERTEX_DATA]  = LLVMArrayType(elem_types[1], data_elems);
 
    vertex_header = LLVMStructTypeInContext(gallivm->context, elem_types,
@@ -403,12 +405,10 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
       target, vertex_header,
       DRAW_JIT_VERTEX_VERTEX_ID);
    */
-   LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip,
-                          target, vertex_header,
-                          DRAW_JIT_VERTEX_CLIP);
-   LP_CHECK_MEMBER_OFFSET(struct vertex_header, pre_clip_pos,
+   (void) target; /* silence unused var warning for non-debug build */
+   LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip_pos,
                           target, vertex_header,
-                          DRAW_JIT_VERTEX_PRE_CLIP_POS);
+                          DRAW_JIT_VERTEX_CLIP_POS);
    LP_CHECK_MEMBER_OFFSET(struct vertex_header, data,
                           target, vertex_header,
                           DRAW_JIT_VERTEX_DATA);
@@ -826,7 +826,7 @@ store_aos(struct gallivm_state *gallivm,
  * struct vertex_header {
  *    unsigned clipmask:DRAW_TOTAL_CLIP_PLANES;
  *    unsigned edgeflag:1;
- *    unsigned have_clipdist:1;
+ *    unsigned pad:1;
  *    unsigned vertex_id:16;
  *    [...]
  * }
@@ -838,7 +838,7 @@ store_aos(struct gallivm_state *gallivm,
  * {
  *   return (x >> 16) |              // vertex_id
  *          ((x & 0x3fff) << 18) |   // clipmask
- *          ((x & 0x4000) << 3) |    // have_clipdist
+ *          ((x & 0x4000) << 3) |    // pad
  *          ((x & 0x8000) << 1);     // edgeflag
  * }
  */
@@ -850,19 +850,23 @@ adjust_mask(struct gallivm_state *gallivm,
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef vertex_id;
    LLVMValueRef clipmask;
-   LLVMValueRef have_clipdist;
+   LLVMValueRef pad;
    LLVMValueRef edgeflag;
 
    vertex_id = LLVMBuildLShr(builder, mask, lp_build_const_int32(gallivm, 16), "");
    clipmask  = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x3fff), "");
    clipmask  = LLVMBuildShl(builder, clipmask, lp_build_const_int32(gallivm, 18), "");
-   have_clipdist = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), "");
-   have_clipdist = LLVMBuildShl(builder, have_clipdist, lp_build_const_int32(gallivm, 3), "");
+   if (0) {
+      pad = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), "");
+      pad = LLVMBuildShl(builder, pad, lp_build_const_int32(gallivm, 3), "");
+   }
    edgeflag = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x8000), "");
    edgeflag = LLVMBuildShl(builder, edgeflag, lp_build_const_int32(gallivm, 1), "");
 
    mask = LLVMBuildOr(builder, vertex_id, clipmask, "");
-   mask = LLVMBuildOr(builder, mask, have_clipdist, "");
+   if (0) {
+      mask = LLVMBuildOr(builder, mask, pad, "");
+   }
    mask = LLVMBuildOr(builder, mask, edgeflag, "");
 #endif
    return mask;
@@ -877,7 +881,7 @@ store_aos_array(struct gallivm_state *gallivm,
                 int attrib,
                 int num_outputs,
                 LLVMValueRef clipmask,
-                boolean have_clipdist)
+                boolean need_edgeflag)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib);
@@ -908,11 +912,15 @@ store_aos_array(struct gallivm_state *gallivm,
        * code here.  See struct vertex_header in draw_private.h.
        */
       assert(DRAW_TOTAL_CLIP_PLANES==14);
-      /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
-      vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
-      if (have_clipdist)
-         vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
-      val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag);
+      /* initialize vertex id:16 = 0xffff, pad:1 = 0, edgeflag:1 = 1 */
+      if (!need_edgeflag) {
+         vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
+      }
+      else {
+         vertex_id_pad_edgeflag = (0xffff << 16);
+      }
+      val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type),
+                                   vertex_id_pad_edgeflag);
       /* OR with the clipmask */
       cliptmp = LLVMBuildOr(builder, val, clipmask, "");
       for (i = 0; i < vector_length; i++) {
@@ -942,7 +950,7 @@ convert_to_aos(struct gallivm_state *gallivm,
                LLVMValueRef clipmask,
                int num_outputs,
                struct lp_type soa_type,
-               boolean have_clipdist)
+               boolean need_edgeflag)
 {
    LLVMBuilderRef builder = gallivm->builder;
    unsigned chan, attrib, i;
@@ -998,7 +1006,8 @@ convert_to_aos(struct gallivm_state *gallivm,
                       aos,
                       attrib,
                       num_outputs,
-                      clipmask, have_clipdist);
+                      clipmask,
+                      need_edgeflag);
    }
 #if DEBUG_STORE
    lp_build_printf(gallivm, "   # storing end\n");
@@ -1014,7 +1023,7 @@ store_clip(struct gallivm_state *gallivm,
            const struct lp_type vs_type,
            LLVMValueRef io_ptr,
            LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-           boolean pre_clip_pos, int idx)
+           int idx)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef soa[4];
@@ -1041,14 +1050,8 @@ store_clip(struct gallivm_state *gallivm,
    soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/
    soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/
 
-   if (!pre_clip_pos) {
-      for (i = 0; i < vs_type.length; i++) {
-         clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]);
-      }
-   } else {
-      for (i = 0; i < vs_type.length; i++) {
-         clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]);
-      }
+   for (i = 0; i < vs_type.length; i++) {
+      clip_ptrs[i] = draw_jit_header_clip_pos(gallivm, io_ptrs[i]);
    }
 
    lp_build_transpose_aos(gallivm, vs_type, soa, soa);
@@ -1140,11 +1143,7 @@ generate_clipmask(struct draw_llvm *llvm,
                   struct gallivm_state *gallivm,
                   struct lp_type vs_type,
                   LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-                  boolean clip_xy,
-                  boolean clip_z,
-                  boolean clip_user,
-                  boolean clip_halfz,
-                  unsigned ucp_enable,
+                  struct draw_llvm_variant_key *key,
                   LLVMValueRef context_ptr,
                   boolean *have_clipdist)
 {
@@ -1160,7 +1159,9 @@ generate_clipmask(struct draw_llvm *llvm,
    const unsigned pos = llvm->draw->vs.position_output;
    const unsigned cv = llvm->draw->vs.clipvertex_output;
    int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance;
-   bool have_cd = false;
+   boolean have_cd = false;
+   boolean clip_user = key->clip_user;
+   unsigned ucp_enable = key->ucp_enable;
    unsigned cd[2];
 
    cd[0] = llvm->draw->vs.clipdistance_output[0];
@@ -1200,8 +1201,16 @@ generate_clipmask(struct draw_llvm *llvm,
       cv_w = pos_w;
    }
 
+   /*
+    * Be careful with the comparisons and NaNs (using llvm's unordered
+    * comparisons here).
+    */
    /* Cliptest, for hardwired planes */
-   if (clip_xy) {
+   /*
+    * XXX should take guardband into account (currently not in key).
+    * Otherwise might run the draw pipeline stages for nothing.
+    */
+   if (key->clip_xy) {
       /* plane 1 */
       test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w);
       temp = shift;
@@ -1229,9 +1238,9 @@ generate_clipmask(struct draw_llvm *llvm,
       mask = LLVMBuildOr(builder, mask, test, "");
    }
 
-   if (clip_z) {
+   if (key->clip_z) {
       temp = lp_build_const_int_vec(gallivm, i32_type, 16);
-      if (clip_halfz) {
+      if (key->clip_halfz) {
          /* plane 5 */
          test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
          test = LLVMBuildAnd(builder, test, temp, "");
@@ -1318,6 +1327,20 @@ generate_clipmask(struct draw_llvm *llvm,
          }
       }
    }
+   if (key->need_edgeflags) {
+      /*
+       * This isn't really part of clipmask but stored the same in vertex
+       * header later, so do it here.
+       */
+      unsigned edge_attr = llvm->draw->vs.edgeflag_output;
+      LLVMValueRef one = lp_build_const_vec(gallivm, f32_type, 1.0);
+      LLVMValueRef edgeflag = LLVMBuildLoad(builder, outputs[edge_attr][0], "");
+      test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_EQUAL, one, edgeflag);
+      temp = lp_build_const_int_vec(gallivm, i32_type,
+                                    1LL << DRAW_TOTAL_CLIP_PLANES);
+      test = LLVMBuildAnd(builder, test, temp, "");
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }
    return mask;
 }
 
@@ -1329,7 +1352,8 @@ generate_clipmask(struct draw_llvm *llvm,
 static LLVMValueRef
 clipmask_booli32(struct gallivm_state *gallivm,
                  const struct lp_type vs_type,
-                 LLVMValueRef clipmask_bool_ptr)
+                 LLVMValueRef clipmask_bool_ptr,
+                 boolean edgeflag_in_clipmask)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
@@ -1339,8 +1363,18 @@ clipmask_booli32(struct gallivm_state *gallivm,
    int i;
 
    /*
-    * Can do this with log2(vector length) pack instructions and one extract
-    * (as we don't actually need a or) with sse2 which would be way better.
+    * We need to invert the edgeflag bit from the clipmask here
+    * (because the result is really if we want to run the pipeline or not
+    * and we (may) need it if edgeflag was 0).
+    */
+   if (edgeflag_in_clipmask) {
+      struct lp_type i32_type = lp_int_type(vs_type);
+      LLVMValueRef edge = lp_build_const_int_vec(gallivm, i32_type,
+                                                 1LL << DRAW_TOTAL_CLIP_PLANES);
+      clipmask_bool = LLVMBuildXor(builder, clipmask_bool, edge, "");
+   }
+   /*
+    * Could do much better with just cmp/movmskps.
     */
    for (i=0; i < vs_type.length; i++) {
       temp = LLVMBuildExtractElement(builder, clipmask_bool,
@@ -1536,8 +1570,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
    const boolean bypass_viewport = key->has_gs || key->bypass_viewport ||
                                    llvm->draw->vs.vertex_shader->info.writes_viewport_index;
    const boolean enable_cliptest = !key->has_gs && (key->clip_xy ||
-                                                    key->clip_z  ||
-                                                    key->clip_user);
+                                                    key->clip_z ||
+                                                    key->clip_user ||
+                                                    key->need_edgeflags);
    LLVMValueRef variant_func;
    const unsigned pos = llvm->draw->vs.position_output;
    const unsigned cv = llvm->draw->vs.clipvertex_output;
@@ -1766,8 +1801,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
 
       if (pos != -1 && cv != -1) {
          /* store original positions in clip before further manipulation */
-         store_clip(gallivm, vs_type, io, outputs, FALSE, key->clip_user ? cv : pos);
-         store_clip(gallivm, vs_type, io, outputs, TRUE, pos);
+         store_clip(gallivm, vs_type, io, outputs, pos);
 
          /* do cliptest */
          if (enable_cliptest) {
@@ -1777,11 +1811,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
                                          gallivm,
                                          vs_type,
                                          outputs,
-                                         key->clip_xy,
-                                         key->clip_z,
-                                         key->clip_user,
-                                         key->clip_halfz,
-                                         key->ucp_enable,
+                                         key,
                                          context_ptr, &have_clipdist);
             temp = LLVMBuildOr(builder, clipmask, temp, "");
             /* store temporary clipping boolean value */
@@ -1806,14 +1836,15 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
        */
       convert_to_aos(gallivm, io, NULL, outputs, clipmask,
                      vs_info->num_outputs, vs_type,
-                     have_clipdist);
+                     enable_cliptest && key->need_edgeflags);
    }
    lp_build_loop_end_cond(&lp_loop, count, step, LLVMIntUGE);
 
    sampler->destroy(sampler);
 
    /* return clipping boolean value for function */
-   ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr);
+   ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr,
+                          enable_cliptest && key->need_edgeflags);
 
    LLVMBuildRet(builder, ret);
 
@@ -1830,6 +1861,8 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
 
    key = (struct draw_llvm_variant_key *)store;
 
+   memset(key, 0, offsetof(struct draw_llvm_variant_key, vertex_element[0]));
+
    key->clamp_vertex_color = llvm->draw->rasterizer->clamp_vertex_color; /**/
 
    /* Presumably all variants of the shader should have the same
@@ -1847,11 +1880,11 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
    key->clip_user = llvm->draw->clip_user;
    key->bypass_viewport = llvm->draw->bypass_viewport;
    key->clip_halfz = llvm->draw->rasterizer->clip_halfz;
+   /* XXX assumes edgeflag output not at 0 */
    key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE);
    key->ucp_enable = llvm->draw->rasterizer->clip_plane_enable;
    key->has_gs = llvm->draw->gs.geometry_shader != NULL;
    key->num_outputs = draw_total_vs_outputs(llvm->draw);
-   key->pad1 = 0;
 
    /* All variants of this shader will have the same value for
     * nr_samplers.  Not yet trying to compact away holes in the
@@ -2283,6 +2316,8 @@ draw_gs_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
 
    key = (struct draw_gs_llvm_variant_key *)store;
 
+   memset(key, 0, offsetof(struct draw_gs_llvm_variant_key, samplers[0]));
+
    key->num_outputs = draw_total_gs_outputs(llvm->draw);
 
    /* All variants of this shader will have the same value for
index d153c16..271433c 100644 (file)
@@ -104,8 +104,7 @@ enum {
 
 enum {
    DRAW_JIT_VERTEX_VERTEX_ID = 0,
-   DRAW_JIT_VERTEX_CLIP,
-   DRAW_JIT_VERTEX_PRE_CLIP_POS,
+   DRAW_JIT_VERTEX_CLIP_POS,
    DRAW_JIT_VERTEX_DATA
 };
 
@@ -162,11 +161,8 @@ enum {
 #define draw_jit_header_id(_gallivm, _ptr)              \
    lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_VERTEX_ID, "id")
 
-#define draw_jit_header_clip(_gallivm, _ptr) \
-   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP, "clip")
-
-#define draw_jit_header_pre_clip_pos(_gallivm, _ptr) \
-   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_PRE_CLIP_POS, "pre_clip_pos")
+#define draw_jit_header_clip_pos(_gallivm, _ptr) \
+   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP_POS, "clip_pos")
 
 #define draw_jit_header_data(_gallivm, _ptr)            \
    lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_DATA, "data")
@@ -315,12 +311,8 @@ struct draw_llvm_variant_key
    unsigned need_edgeflags:1;
    unsigned has_gs:1;
    unsigned num_outputs:8;
-   /*
-    * it is important there are no holes in this struct
-    * (and all padding gets zeroed).
-    */
    unsigned ucp_enable:PIPE_MAX_CLIP_PLANES;
-   unsigned pad1:24-PIPE_MAX_CLIP_PLANES;
+   /* note padding here - must use memset */
 
    /* Variable number of vertex elements:
     */
@@ -336,6 +328,7 @@ struct draw_gs_llvm_variant_key
    unsigned nr_samplers:8;
    unsigned nr_sampler_views:8;
    unsigned num_outputs:8;
+   /* note padding here - must use memset */
 
    struct draw_sampler_static_state samplers[1];
 };
index 877db59..e85ae16 100644 (file)
@@ -646,6 +646,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    struct pipe_context *pipe = draw->pipe;
    const struct pipe_rasterizer_state *rast = draw->rasterizer;
    uint num_samplers;
+   uint num_sampler_views;
    void *r;
 
    assert(draw->rasterizer->line_smooth);
@@ -667,9 +668,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    draw_aaline_prepare_outputs(draw, draw->pipeline.aaline);
 
    /* how many samplers? */
-   /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(aaline->num_sampler_views, aaline->num_samplers);
-   num_samplers = MAX2(num_samplers, aaline->fs->sampler_unit + 1);
+   /* we'll use sampler/texture[aaline->sampler_unit] for the alpha texture */
+   num_samplers = MAX2(aaline->num_samplers, aaline->fs->sampler_unit + 1);
+   num_sampler_views = MAX2(num_samplers, aaline->num_sampler_views);
 
    aaline->state.sampler[aaline->fs->sampler_unit] = aaline->sampler_cso;
    pipe_sampler_view_reference(&aaline->state.sampler_views[aaline->fs->sampler_unit],
@@ -681,7 +682,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
                                       num_samplers, aaline->state.sampler);
 
    aaline->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                    num_samplers, aaline->state.sampler_views);
+                                    num_sampler_views, aaline->state.sampler_views);
 
    /* Disable triangle culling, stippling, unfilled mode etc. */
    r = draw_get_rasterizer_no_cull(draw, rast->scissor, rast->flatshade);
@@ -937,7 +938,7 @@ draw_aaline_prepare_outputs(struct draw_context *draw,
    const struct pipe_rasterizer_state *rast = draw->rasterizer;
 
    /* update vertex attrib info */
-   aaline->pos_slot = draw_current_shader_position_output(draw);;
+   aaline->pos_slot = draw_current_shader_position_output(draw);
 
    if (!rast->line_smooth)
       return;
index 47765cd..2d92d65 100644 (file)
@@ -59,6 +59,8 @@ struct clip_stage {
    struct draw_stage stage;      /**< base class */
 
    unsigned pos_attr;
+   boolean have_clipdist;
+   int cv_attr;
 
    /* List of the attributes to be constant interpolated. */
    uint num_const_attribs;
@@ -145,20 +147,23 @@ static void interp(const struct clip_stage *clip,
     */
    dst->clipmask = 0;
    dst->edgeflag = 0;        /* will get overwritten later */
-   dst->have_clipdist = in->have_clipdist;
+   dst->pad = 0;
    dst->vertex_id = UNDEFINED_VERTEX_ID;
 
    /* Interpolate the clip-space coords.
     */
-   interp_attr(dst->clip, t, in->clip, out->clip);
+   if (clip->cv_attr >= 0) {
+      interp_attr(dst->data[clip->cv_attr], t,
+                  in->data[clip->cv_attr], out->data[clip->cv_attr]);
+   }
    /* interpolate the clip-space position */
-   interp_attr(dst->pre_clip_pos, t, in->pre_clip_pos, out->pre_clip_pos);
+   interp_attr(dst->clip_pos, t, in->clip_pos, out->clip_pos);
 
    /* Do the projective divide and viewport transformation to get
     * new window coordinates:
     */
    {
-      const float *pos = dst->pre_clip_pos;
+      const float *pos = dst->clip_pos;
       const float *scale =
          clip->stage.draw->viewports[viewport_index].scale;
       const float *trans =
@@ -192,11 +197,11 @@ static void interp(const struct clip_stage *clip,
       t_nopersp = t;
       /* find either in.x != out.x or in.y != out.y */
       for (k = 0; k < 2; k++) {
-         if (in->pre_clip_pos[k] != out->pre_clip_pos[k]) {
+         if (in->clip_pos[k] != out->clip_pos[k]) {
             /* do divide by W, then compute linear interpolation factor */
-            float in_coord = in->pre_clip_pos[k] / in->pre_clip_pos[3];
-            float out_coord = out->pre_clip_pos[k] / out->pre_clip_pos[3];
-            float dst_coord = dst->pre_clip_pos[k] / dst->pre_clip_pos[3];
+            float in_coord = in->clip_pos[k] / in->clip_pos[3];
+            float out_coord = out->clip_pos[k] / out->clip_pos[3];
+            float dst_coord = dst->clip_pos[k] / dst->clip_pos[3];
             t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord);
             break;
          }
@@ -214,9 +219,9 @@ static void interp(const struct clip_stage *clip,
  * Triangle is considered null/empty if its area is equal to zero.
  */
 static inline boolean
-is_tri_null(struct draw_context *draw, const struct prim_header *header)
+is_tri_null(const struct clip_stage *clip, const struct prim_header *header)
 {
-   const unsigned pos_attr = draw_current_shader_position_output(draw);
+   const unsigned pos_attr = clip->pos_attr;
    float x1 = header->v[1]->data[pos_attr][0] - header->v[0]->data[pos_attr][0];
    float y1 = header->v[1]->data[pos_attr][1] - header->v[0]->data[pos_attr][1];
    float z1 = header->v[1]->data[pos_attr][2] - header->v[0]->data[pos_attr][2];
@@ -242,6 +247,7 @@ static void emit_poly(struct draw_stage *stage,
                       unsigned n,
                       const struct prim_header *origPrim)
 {
+   const struct clip_stage *clipper = clip_stage(stage);
    struct prim_header header;
    unsigned i;
    ushort edge_first, edge_middle, edge_last;
@@ -281,7 +287,7 @@ static void emit_poly(struct draw_stage *stage,
          header.v[2] = inlist[0];  /* the provoking vertex */
       }
 
-      tri_null = is_tri_null(stage->draw, &header);
+      tri_null = is_tri_null(clipper, &header);
       /* If we generated a triangle with an area, aka. non-null triangle,
        * or if the previous triangle was also null then skip all subsequent
        * null triangles */
@@ -306,11 +312,18 @@ static void emit_poly(struct draw_stage *stage,
          debug_printf("Clipped tri: (flat-shade-first = %d)\n",
                       stage->draw->rasterizer->flatshade_first);
          for (j = 0; j < 3; j++) {
-            debug_printf("  Vert %d: clip: %f %f %f %f\n", j,
-                         header.v[j]->clip[0],
-                         header.v[j]->clip[1],
-                         header.v[j]->clip[2],
-                         header.v[j]->clip[3]);
+            debug_printf("  Vert %d: clip pos: %f %f %f %f\n", j,
+                         header.v[j]->clip_pos[0],
+                         header.v[j]->clip_pos[1],
+                         header.v[j]->clip_pos[2],
+                         header.v[j]->clip_pos[3]);
+            if (clipper->cv_attr >= 0) {
+               debug_printf("  Vert %d: cv: %f %f %f %f\n", j,
+                            header.v[j]->data[clipper->cv_attr][0],
+                            header.v[j]->data[clipper->cv_attr][1],
+                            header.v[j]->data[clipper->cv_attr][2],
+                            header.v[j]->data[clipper->cv_attr][3]);
+            }
             for (k = 0; k < draw_num_shader_outputs(stage->draw); k++) {
                debug_printf("  Vert %d: Attr %d:  %f %f %f %f\n", j, k,
                             header.v[j]->data[k][0],
@@ -320,7 +333,7 @@ static void emit_poly(struct draw_stage *stage,
             }
          }
       }
-      stage->next->tri( stage->next, &header );
+      stage->next->tri(stage->next, &header);
    }
 }
 
@@ -345,15 +358,28 @@ static inline float getclipdist(const struct clip_stage *clipper,
 {
    const float *plane;
    float dp;
-   if (vert->have_clipdist && plane_idx >= 6) {
+   if (plane_idx < 6) {
+      /* ordinary xyz view volume clipping uses pos output */
+      plane = clipper->plane[plane_idx];
+      dp = dot4(vert->clip_pos, plane);
+   }
+   else if (clipper->have_clipdist) {
       /* pick the correct clipdistance element from the output vectors */
       int _idx = plane_idx - 6;
       int cdi = _idx >= 4;
       int vidx = cdi ? _idx - 4 : _idx;
       dp = vert->data[draw_current_shader_clipdistance_output(clipper->stage.draw, cdi)][vidx];
    } else {
+      /*
+       * legacy user clip planes or gl_ClipVertex
+       */
       plane = clipper->plane[plane_idx];
-      dp = dot4(vert->clip, plane);
+      if (clipper->cv_attr >= 0) {
+         dp = dot4(vert->data[clipper->cv_attr], plane);
+      }
+      else {
+         dp = dot4(vert->clip_pos, plane);
+      }
    }
    return dp;
 }
@@ -400,13 +426,22 @@ do_clip_tri(struct draw_stage *stage,
    viewport_index = draw_viewport_index(clipper->stage.draw, prov_vertex);
 
    if (DEBUG_CLIP) {
-      const float *v0 = header->v[0]->clip;
-      const float *v1 = header->v[1]->clip;
-      const float *v2 = header->v[2]->clip;
-      debug_printf("Clip triangle:\n");
+      const float *v0 = header->v[0]->clip_pos;
+      const float *v1 = header->v[1]->clip_pos;
+      const float *v2 = header->v[2]->clip_pos;
+      debug_printf("Clip triangle pos:\n");
       debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]);
       debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]);
       debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]);
+      if (clipper->cv_attr >= 0) {
+         const float *v0 = header->v[0]->data[clipper->cv_attr];
+         const float *v1 = header->v[1]->data[clipper->cv_attr];
+         const float *v2 = header->v[2]->data[clipper->cv_attr];
+         debug_printf("Clip triangle cv:\n");
+         debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]);
+         debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]);
+         debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]);
+      }
    }
 
    /*
@@ -555,7 +590,7 @@ do_clip_tri(struct draw_stage *stage,
 
       /* Emit the polygon as triangles to the setup stage:
        */
-      emit_poly( stage, inlist, inEdges, n, header );
+      emit_poly(stage, inlist, inEdges, n, header);
    }
 }
 
@@ -567,7 +602,7 @@ do_clip_line(struct draw_stage *stage,
              struct prim_header *header,
              unsigned clipmask)
 {
-   const struct clip_stage *clipper = clip_stage( stage );
+   const struct clip_stage *clipper = clip_stage(stage);
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    struct vertex_header *prov_vertex;
@@ -576,6 +611,8 @@ do_clip_line(struct draw_stage *stage,
    struct prim_header newprim;
    int viewport_index;
 
+   newprim.flags = header->flags;
+
    if (stage->draw->rasterizer->flatshade_first) {
       prov_vertex = v0;
    }
@@ -671,9 +708,9 @@ clip_point_guard_xy(struct draw_stage *stage, struct prim_header *header)
           * automatically). These would usually be captured by depth clip
           * too but this can be disabled.
           */
-         if (header->v[0]->clip[3] <= 0.0f ||
-             util_is_inf_or_nan(header->v[0]->clip[0]) ||
-             util_is_inf_or_nan(header->v[0]->clip[1]))
+         if (header->v[0]->clip_pos[3] <= 0.0f ||
+             util_is_inf_or_nan(header->v[0]->clip_pos[0]) ||
+             util_is_inf_or_nan(header->v[0]->clip_pos[1]))
             return;
       }
       stage->next->point(stage->next, header);
@@ -773,7 +810,7 @@ find_interp(const struct draw_fragment_shader *fs, int *indexed_interp,
 static void 
 clip_init_state(struct draw_stage *stage)
 {
-   struct clip_stage *clipper = clip_stage( stage );
+   struct clip_stage *clipper = clip_stage(stage);
    const struct draw_context *draw = stage->draw;
    const struct draw_fragment_shader *fs = draw->fs.fragment_shader;
    const struct tgsi_shader_info *info = draw_get_shader_info(draw);
@@ -781,6 +818,13 @@ clip_init_state(struct draw_stage *stage)
    int indexed_interp[2];
 
    clipper->pos_attr = draw_current_shader_position_output(draw);
+   clipper->have_clipdist = draw_current_shader_num_written_clipdistances(draw) > 0;
+   if (draw_current_shader_clipvertex_output(draw) != clipper->pos_attr) {
+      clipper->cv_attr = (int)draw_current_shader_clipvertex_output(draw);
+   }
+   else {
+      clipper->cv_attr = -1;
+   }
 
    /* We need to know for each attribute what kind of interpolation is
     * done on it (flat, smooth or noperspective).  But the information
index b58d753..cf52ca4 100644 (file)
@@ -477,6 +477,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
    struct pipe_context *pipe = pstip->pipe;
    struct draw_context *draw = stage->draw;
    uint num_samplers;
+   uint num_sampler_views;
 
    assert(stage->draw->rasterizer->poly_stipple_enable);
 
@@ -490,8 +491,8 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(pstip->num_sampler_views, pstip->num_samplers);
-   num_samplers = MAX2(num_samplers, pstip->fs->sampler_unit + 1);
+   num_samplers = MAX2(pstip->num_samplers, pstip->fs->sampler_unit + 1);
+   num_sampler_views = MAX2(pstip->num_sampler_views, num_samplers);
 
    /* plug in our sampler, texture */
    pstip->state.samplers[pstip->fs->sampler_unit] = pstip->sampler_cso;
@@ -506,7 +507,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
                                      num_samplers, pstip->state.samplers);
 
    pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                   num_samplers, pstip->state.sampler_views);
+                                   num_sampler_views, pstip->state.sampler_views);
 
    draw->suspend_flushing = FALSE;
 
index 2517d61..c465c75 100644 (file)
@@ -86,27 +86,33 @@ inject_front_face_info(struct draw_stage *stage,
 }
 
    
-static void point( struct draw_stage *stage,
-                  struct vertex_header *v0 )
+static void point(struct draw_stage *stage,
+                  struct prim_header *header,
+                  struct vertex_header *v0)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
-   stage->next->point( stage->next, &tmp );
+   stage->next->point(stage->next, &tmp);
 }
 
-static void line( struct draw_stage *stage,
-                 struct vertex_header *v0,
-                 struct vertex_header *v1 )
+static void line(struct draw_stage *stage,
+                 struct prim_header *header,
+                 struct vertex_header *v0,
+                 struct vertex_header *v1)
 {
    struct prim_header tmp;
+   tmp.det = header->det;
+   tmp.flags = 0;
    tmp.v[0] = v0;
    tmp.v[1] = v1;
-   stage->next->line( stage->next, &tmp );
+   stage->next->line(stage->next, &tmp);
 }
 
 
-static void points( struct draw_stage *stage,
-                   struct prim_header *header )
+static void points(struct draw_stage *stage,
+                   struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
@@ -114,27 +120,41 @@ static void points( struct draw_stage *stage,
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) point( stage, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) point( stage, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) point( stage, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      point(stage, header, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      point(stage, header, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      point(stage, header, v2);
 }
 
 
-static void lines( struct draw_stage *stage,
-                  struct prim_header *header )
+static void lines(struct draw_stage *stage,
+                  struct prim_header *header)
 {
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    struct vertex_header *v2 = header->v[2];
 
    if (header->flags & DRAW_PIPE_RESET_STIPPLE)
-      stage->next->reset_stipple_counter( stage->next );
+      /*
+       * XXX could revisit this. The only stage which cares is the line
+       * stipple stage. Could just emit correct reset flags here and not
+       * bother about all the calling through reset_stipple_counter
+       * stages. Though technically it is necessary if line stipple is
+       * handled by the driver, but this is not actually hooked up when
+       * using vbuf (vbuf stage reset_stipple_counter does nothing).
+       */
+      stage->next->reset_stipple_counter(stage->next);
 
    inject_front_face_info(stage, header);
 
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) line( stage, v2, v0 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) line( stage, v0, v1 );
-   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) line( stage, v1, v2 );
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag)
+      line(stage, header, v2, v0);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag)
+      line(stage, header, v0, v1);
+   if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag)
+      line(stage, header, v1, v2);
 }
 
 
index f36706c..6df7149 100644 (file)
@@ -74,9 +74,10 @@ struct vbuf_stage {
    unsigned max_indices;
    unsigned nr_indices;
 
-   /* Cache point size somewhere it's address won't change:
+   /* Cache point size somewhere its address won't change:
     */
    float point_size;
+   float zero4[4];
 
    struct translate_cache *cache;
 };
@@ -205,6 +206,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
    struct translate_key hw_key;
    unsigned dst_offset;
    unsigned i;
+   const struct vertex_info *vinfo;
 
    vbuf->render->set_primitive(vbuf->render, prim);
 
@@ -215,27 +217,33 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
     * state change.
     */
    vbuf->vinfo = vbuf->render->get_vertex_info(vbuf->render);
-   vbuf->vertex_size = vbuf->vinfo->size * sizeof(float);
+   vinfo = vbuf->vinfo;
+   vbuf->vertex_size = vinfo->size * sizeof(float);
 
    /* Translate from pipeline vertices to hw vertices.
     */
    dst_offset = 0;
 
-   for (i = 0; i < vbuf->vinfo->num_attribs; i++) {
+   for (i = 0; i < vinfo->num_attribs; i++) {
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       enum pipe_format output_format;
-      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      output_format = draw_translate_vinfo_format(vbuf->vinfo->attrib[i].emit);
-      emit_sz = draw_translate_vinfo_size(vbuf->vinfo->attrib[i].emit);
+      output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+      emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
 
       /* doesn't handle EMIT_OMIT */
       assert(emit_sz != 0);
 
-      if (vbuf->vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
-        src_buffer = 1;
-        src_offset = 0;
+      if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
+         src_buffer = 1;
+         src_offset = 0;
+      }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
@@ -249,7 +257,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       dst_offset += emit_sz;
    }
 
-   hw_key.nr_elements = vbuf->vinfo->num_attribs;
+   hw_key.nr_elements = vinfo->num_attribs;
    hw_key.output_stride = vbuf->vertex_size;
 
    /* Don't bother with caching at this stage:
@@ -261,6 +269,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
       vbuf->translate = translate_cache_find(vbuf->cache, &hw_key);
 
       vbuf->translate->set_buffer(vbuf->translate, 1, &vbuf->point_size, 0, ~0);
+      vbuf->translate->set_buffer(vbuf->translate, 2, &vbuf->zero4[0], 0, ~0);
    }
 
    vbuf->point_size = vbuf->stage.draw->rasterizer->point_size;
@@ -428,7 +437,7 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
    if (!vbuf)
       goto fail;
-   
+
    vbuf->stage.draw = draw;
    vbuf->stage.name = "vbuf";
    vbuf->stage.point = vbuf_first_point;
@@ -437,29 +446,30 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->stage.flush = vbuf_flush;
    vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter;
    vbuf->stage.destroy = vbuf_destroy;
-   
+
    vbuf->render = render;
    vbuf->max_indices = MIN2(render->max_indices, UNDEFINED_VERTEX_ID-1);
 
-   vbuf->indices = (ushort *) align_malloc( vbuf->max_indices * 
-                                           sizeof(vbuf->indices[0]), 
-                                           16 );
+   vbuf->indices = (ushort *) align_malloc(vbuf->max_indices *
+                    sizeof(vbuf->indices[0]),
+                    16);
    if (!vbuf->indices)
       goto fail;
 
    vbuf->cache = translate_cache_create();
-   if (!vbuf->cache) 
+   if (!vbuf->cache)
       goto fail;
-      
-   
+
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
-   
+
+   vbuf->zero4[0] = vbuf->zero4[1] = vbuf->zero4[2] = vbuf->zero4[3] = 0.0f;
+
    return &vbuf->stage;
 
- fail:
+fail:
    if (vbuf)
       vbuf_destroy(&vbuf->stage);
-   
+
    return NULL;
 }
index 5584c4a..8774beb 100644 (file)
@@ -86,11 +86,10 @@ struct draw_vertex_buffer {
 struct vertex_header {
    unsigned clipmask:DRAW_TOTAL_CLIP_PLANES;
    unsigned edgeflag:1;
-   unsigned have_clipdist:1;
+   unsigned pad:1;
    unsigned vertex_id:16;
 
-   float clip[4];
-   float pre_clip_pos[4];
+   float clip_pos[4];
 
    /* This will probably become float (*data)[4] soon:
     */
index 0204b43..5a49acb 100644 (file)
@@ -524,7 +524,7 @@ draw_vbo(struct draw_context *draw,
 #endif
    {
       if (index_limit == 0) {
-      /* one of the buffers is too small to do any valid drawing */
+         /* one of the buffers is too small to do any valid drawing */
          debug_warning("draw: VBO too small to draw anything\n");
          util_fpstate_set(fpstate);
          return;
index d1eafd8..6fb630b 100644 (file)
@@ -44,6 +44,9 @@ struct pt_emit {
    unsigned prim;
 
    const struct vertex_info *vinfo;
+
+   float zero4[4];
+
 };
 
 
@@ -60,7 +63,7 @@ draw_pt_emit_prepare(struct pt_emit *emit,
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    /* XXX: may need to defensively reset this later on as clipping can
     * clobber this state in the render backend.
@@ -80,7 +83,7 @@ draw_pt_emit_prepare(struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = vinfo->attrib[i].src_index * 4 * sizeof(float);
 
       output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
       emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
@@ -89,8 +92,13 @@ draw_pt_emit_prepare(struct pt_emit *emit,
       assert(emit_sz != 0);
 
       if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
-        src_buffer = 1;
-        src_offset = 0;
+         src_buffer = 1;
+         src_offset = 0;
+      }
+      else if (vinfo->attrib[i].src_index == DRAW_ATTR_NONEXIST) {
+         /* elements which don't exist will get assigned zeros */
+         src_buffer = 2;
+         src_offset = 0;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
@@ -111,6 +119,8 @@ draw_pt_emit_prepare(struct pt_emit *emit,
        translate_key_compare(&emit->translate->key, &hw_key) != 0) {
       translate_key_sanitize(&hw_key);
       emit->translate = translate_cache_find(emit->cache, &hw_key);
+
+      emit->translate->set_buffer(emit->translate, 2, &emit->zero4[0], 0, ~0);
    }
 
    if (!vinfo->size)
@@ -138,7 +148,7 @@ draw_pt_emit(struct pt_emit *emit,
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    if (vertex_count == 0)
       return;
@@ -152,31 +162,31 @@ draw_pt_emit(struct pt_emit *emit,
                              (ushort)translate->key.output_stride,
                              (ushort)vertex_count);
 
-   hw_verts = render->map_vertices( render );
+   hw_verts = render->map_vertices(render);
    if (!hw_verts) {
       debug_warn_once("map of vertex buffer failed (out of memory?)");
       return;
    }
 
    translate->set_buffer(translate,
-                        0,
-                        vertex_data,
-                        stride,
-                        ~0);
+                         0,
+                         vertex_data,
+                         stride,
+                         ~0);
 
    translate->set_buffer(translate,
-                        1,
-                        &draw->rasterizer->point_size,
-                        0,
-                        ~0);
+                         1,
+                         &draw->rasterizer->point_size,
+                         0,
+                         ~0);
 
    /* fetch/translate vertex attribs to fill hw_verts[] */
    translate->run(translate,
-                 0,
-                 vertex_count,
-                  draw->start_instance,
-                  draw->instance_id,
-                 hw_verts );
+                  0,
+                  vertex_count,
+                  0,
+                  0,
+                  hw_verts);
 
    render->unmap_vertices(render, 0, vertex_count - 1);
 
@@ -212,7 +222,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
 #endif
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
@@ -224,35 +234,35 @@ draw_pt_emit_linear(struct pt_emit *emit,
                                   (ushort)count))
       goto fail;
 
-   hw_verts = render->map_vertices( render );
+   hw_verts = render->map_vertices(render);
    if (!hw_verts)
       goto fail;
 
    translate->set_buffer(translate, 0,
-                        vertex_data, stride, count - 1);
+                         vertex_data, stride, count - 1);
 
    translate->set_buffer(translate, 1,
-                        &draw->rasterizer->point_size,
-                        0, ~0);
+                         &draw->rasterizer->point_size,
+                         0, ~0);
 
    translate->run(translate,
                   0,
                   count,
-                  draw->start_instance,
-                  draw->instance_id,
+                  0,
+                  0,
                   hw_verts);
 
    if (0) {
       unsigned i;
       for (i = 0; i < count; i++) {
          debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i);
-         draw_dump_emitted_vertex( emit->vinfo,
-                                   (const uint8_t *)hw_verts +
-                                   translate->key.output_stride * i );
+         draw_dump_emitted_vertex(emit->vinfo,
+                                  (const uint8_t *)hw_verts +
+                                  translate->key.output_stride * i);
       }
    }
 
-   render->unmap_vertices( render, 0, count - 1 );
+   render->unmap_vertices(render, 0, count - 1);
 
    for (start = i = 0;
         i < prim_info->primitive_count;
@@ -262,7 +272,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
                           start,
                           prim_info->primitive_lengths[i]);
    }
-   
+
    render->release_vertices(render);
 
    return;
@@ -287,6 +297,8 @@ draw_pt_emit_create(struct draw_context *draw)
       return NULL;
    }
 
+   emit->zero4[0] = emit->zero4[1] = emit->zero4[2] = emit->zero4[3] = 0.0f;
+
    return emit;
 }
 
index 2d7569b..edd4541 100644 (file)
@@ -453,6 +453,7 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
                                draw->vs.vertex_shader->info.writes_viewport_index)) {
          clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info, prim_info );
       }
+      /* "clipped" also includes non-one edgeflag */
       if (clipped) {
          opt |= PT_PIPELINE;
       }
index f0d5e0f..3a9101a 100644 (file)
@@ -58,7 +58,7 @@ initialize_vertex_header(struct vertex_header *header)
 {
    header->clipmask = 0;
    header->edgeflag = 1;
-   header->have_clipdist = 0;
+   header->pad = 0;
    header->vertex_id = UNDEFINED_VERTEX_ID;
 }
 
index 20de26f..261bd34 100644 (file)
@@ -275,7 +275,7 @@ void draw_pt_so_emit( struct pt_so_emit *emit,
    emit->generated_primitives = 0;
    emit->input_vertex_stride = input_verts->stride;
    if (emit->use_pre_clip_pos)
-      emit->pre_clip_pos = input_verts->verts->pre_clip_pos;
+      emit->pre_clip_pos = input_verts->verts->clip_pos;
 
    emit->inputs = (const float (*)[4])input_verts->verts->data;
 
index ee11d2f..c7b1afe 100644 (file)
@@ -44,6 +44,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 
+#define DRAW_ATTR_NONEXIST 255
 
 /**
  * Vertex attribute emit modes
@@ -61,18 +62,6 @@ enum attrib_emit {
 
 
 /**
- * Attribute interpolation mode
- */
-enum interp_mode {
-   INTERP_NONE,      /**< never interpolate vertex header info */
-   INTERP_POS,       /**< special case for frag position */
-   INTERP_CONSTANT,
-   INTERP_LINEAR,
-   INTERP_PERSPECTIVE
-};
-
-
-/**
  * Information about hardware/rasterization vertex layout.
  */
 struct vertex_info
@@ -85,8 +74,7 @@ struct vertex_info
     * memcmp() comparisons.
     */
    struct {
-      unsigned interp_mode:4;      /**< INTERP_x */
-      unsigned emit:4;             /**< EMIT_x */
+      unsigned emit:8;             /**< EMIT_x */
       unsigned src_index:8;          /**< map to post-xform attribs */
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
@@ -124,20 +112,18 @@ draw_vinfo_copy( struct vertex_info *dst,
 static inline uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
                       enum attrib_emit emit, 
-                      enum interp_mode interp, /* only used by softpipe??? */
                       int src_index)
 {
    const uint n = vinfo->num_attribs;
 
    /* If the src_index is negative, meaning it hasn't been found
-    * lets just redirect it to the first output slot */
+    * we'll assign it all zeros later - set to DRAW_ATTR_NONEXIST */
    if (src_index < 0) {
-      src_index = 0;
+      src_index = DRAW_ATTR_NONEXIST;
    }
 
    assert(n < Elements(vinfo->attrib));
    vinfo->attrib[n].emit = emit;
-   vinfo->attrib[n].interp_mode = interp;
    vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
index 1424447..7854142 100644 (file)
@@ -458,7 +458,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
    {
       /* Special case 4x4f --> 1x16ub */
       if (src_type.length == 4 &&
-          util_cpu_caps.has_sse2)
+            (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
       {
          num_dsts = (num_srcs + 3) / 4;
          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
@@ -545,7 +545,7 @@ lp_build_conv(struct gallivm_state *gallivm,
        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
 
-       util_cpu_caps.has_sse2)
+       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
    {
       struct lp_build_context bld;
       struct lp_type int16_type, int32_type;
index ad64ae0..4598db8 100644 (file)
@@ -136,6 +136,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
index b1aef71..f571838 100644 (file)
@@ -720,7 +720,7 @@ lp_build_transpose_aos_n(struct gallivm_state *gallivm,
 
       default:
          assert(0);
-   };
+   }
 }
 
 
index 09c1b37..8c39ab0 100644 (file)
@@ -1197,7 +1197,7 @@ get_soa_array_offsets(struct lp_build_context *uint_bld,
 
    if (need_perelement_offset) {
       LLVMValueRef pixel_offsets;
-      int i;
+      unsigned i;
      /* build pixel offset vector: {0, 1, 2, 3, ...} */
       pixel_offsets = uint_bld->undef;
       for (i = 0; i < uint_bld->type.length; i++) {
@@ -1809,7 +1809,7 @@ emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *float_bld = &bld_base->base;
-   int i;
+   unsigned i;
    LLVMValueRef temp, temp2;
    LLVMValueRef shuffles[8];
    LLVMValueRef shuffles2[8];
@@ -2713,7 +2713,7 @@ static boolean
 near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
                    int pc)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < 5; i++) {
       unsigned opcode;
index c5c3332..75afebe 100644 (file)
@@ -431,7 +431,7 @@ hud_alloc_vertices(struct hud_context *hud, struct vertex_queue *v,
    v->max_num_vertices = num_vertices;
    v->vbuf.stride = stride;
    u_upload_alloc(hud->uploader, 0, v->vbuf.stride * v->max_num_vertices,
-                  &v->vbuf.buffer_offset, &v->vbuf.buffer,
+                  16, &v->vbuf.buffer_offset, &v->vbuf.buffer,
                   (void**)&v->vertices);
 }
 
@@ -1176,8 +1176,8 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
 
    hud->pipe = pipe;
    hud->cso = cso;
-   hud->uploader = u_upload_create(pipe, 256 * 1024, 16,
-                                   PIPE_BIND_VERTEX_BUFFER);
+   hud->uploader = u_upload_create(pipe, 256 * 1024,
+                                   PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
 
    /* font */
    if (!util_font_create(pipe, UTIL_FONT_FIXED_8X13, &hud->font)) {
index 70d3e85..5effd88 100644 (file)
@@ -153,10 +153,11 @@ util_primconvert_draw_vbo(struct primconvert_context *pc,
    }
 
    if (!pc->upload) {
-      pc->upload = u_upload_create(pc->pipe, 4096, 4, PIPE_BIND_INDEX_BUFFER);
+      pc->upload = u_upload_create(pc->pipe, 4096, PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    }
 
-   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count,
+   u_upload_alloc(pc->upload, 0, new_ib.index_size * new_info.count, 4,
                   &new_ib.offset, &new_ib.buffer, &dst);
 
    if (info->indexed) {
index 5def6d3..7c57759 100644 (file)
  * IN THE SOFTWARE.
  */
 
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
-#endif
-
 #include "util/ralloc.h"
 #include "glsl/nir/nir.h"
 #include "glsl/nir/nir_control_flow.h"
@@ -1069,7 +1065,9 @@ ttn_kill(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 static void
 ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
-   nir_ssa_def *cmp = nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)));
+   nir_ssa_def *cmp = nir_bany_inequal4(b, nir_flt(b, src[0],
+                                                   nir_imm_float(b, 0.0)),
+                                        nir_imm_int(b, 0));
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
    discard->src[0] = nir_src_for_ssa(cmp);
@@ -1901,6 +1899,7 @@ ttn_emit_instruction(struct ttn_compile *c)
                                            &tgsi_dst->Indirect : NULL;
 
       store->num_components = 4;
+      store->const_index[0] = 0xf;
       store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
       store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
 
@@ -1951,7 +1950,7 @@ tgsi_processor_to_shader_stage(unsigned processor)
    case TGSI_PROCESSOR_COMPUTE:   return MESA_SHADER_COMPUTE;
    default:
       unreachable("invalid TGSI processor");
-   };
+   }
 }
 
 struct nir_shader *
@@ -1969,15 +1968,10 @@ tgsi_to_nir(const void *tgsi_tokens,
    tgsi_scan_shader(tgsi_tokens, &scan);
    c->scan = &scan;
 
-   s = nir_shader_create(NULL, tgsi_processor_to_shader_stage(scan.processor),
-                         options);
-
-   nir_function *func = nir_function_create(s, "main");
-   nir_function_overload *overload = nir_function_overload_create(func);
-   nir_function_impl *impl = nir_function_impl_create(overload);
-
-   nir_builder_init(&c->build, impl);
-   c->build.cursor = nir_after_cf_list(&impl->body);
+   nir_builder_init_simple_shader(&c->build, NULL,
+                                  tgsi_processor_to_shader_stage(scan.processor),
+                                  options);
+   s = c->build.shader;
 
    s->num_inputs = scan.file_max[TGSI_FILE_INPUT] + 1;
    s->num_uniforms = scan.const_file_max[0] + 1;
index fdb7feb..ea20746 100644 (file)
@@ -259,36 +259,39 @@ tgsi_build_declaration_semantic(
    return ds;
 }
 
-static struct tgsi_declaration_resource
-tgsi_default_declaration_resource(void)
+static struct tgsi_declaration_image
+tgsi_default_declaration_image(void)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr.Resource = TGSI_TEXTURE_BUFFER;
-   dr.Raw = 0;
-   dr.Writable = 0;
-   dr.Padding = 0;
+   di.Resource = TGSI_TEXTURE_BUFFER;
+   di.Raw = 0;
+   di.Writable = 0;
+   di.Format = 0;
+   di.Padding = 0;
 
-   return dr;
+   return di;
 }
 
-static struct tgsi_declaration_resource
-tgsi_build_declaration_resource(unsigned texture,
-                                unsigned raw,
-                                unsigned writable,
-                                struct tgsi_declaration *declaration,
-                                struct tgsi_header *header)
+static struct tgsi_declaration_image
+tgsi_build_declaration_image(unsigned texture,
+                             unsigned format,
+                             unsigned raw,
+                             unsigned writable,
+                             struct tgsi_declaration *declaration,
+                             struct tgsi_header *header)
 {
-   struct tgsi_declaration_resource dr;
+   struct tgsi_declaration_image di;
 
-   dr = tgsi_default_declaration_resource();
-   dr.Resource = texture;
-   dr.Raw = raw;
-   dr.Writable = writable;
+   di = tgsi_default_declaration_image();
+   di.Resource = texture;
+   di.Format = format;
+   di.Raw = raw;
+   di.Writable = writable;
 
    declaration_grow(declaration, header);
 
-   return dr;
+   return di;
 }
 
 static struct tgsi_declaration_sampler_view
@@ -364,7 +367,7 @@ tgsi_default_full_declaration( void )
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
    full_declaration.Interp = tgsi_default_declaration_interp();
-   full_declaration.Resource = tgsi_default_declaration_resource();
+   full_declaration.Image = tgsi_default_declaration_image();
    full_declaration.SamplerView = tgsi_default_declaration_sampler_view();
    full_declaration.Array = tgsi_default_declaration_array();
 
@@ -454,20 +457,21 @@ tgsi_build_full_declaration(
          header );
    }
 
-   if (full_decl->Declaration.File == TGSI_FILE_RESOURCE) {
-      struct tgsi_declaration_resource *dr;
+   if (full_decl->Declaration.File == TGSI_FILE_IMAGE) {
+      struct tgsi_declaration_image *di;
 
       if (maxsize <= size) {
          return  0;
       }
-      dr = (struct tgsi_declaration_resource *)&tokens[size];
+      di = (struct tgsi_declaration_image *)&tokens[size];
       size++;
 
-      *dr = tgsi_build_declaration_resource(full_decl->Resource.Resource,
-                                            full_decl->Resource.Raw,
-                                            full_decl->Resource.Writable,
-                                            declaration,
-                                            header);
+      *di = tgsi_build_declaration_image(full_decl->Image.Resource,
+                                         full_decl->Image.Format,
+                                         full_decl->Image.Raw,
+                                         full_decl->Image.Writable,
+                                         declaration,
+                                         header);
    }
 
    if (full_decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -616,7 +620,8 @@ tgsi_default_instruction( void )
    instruction.NumSrcRegs = 1;
    instruction.Label = 0;
    instruction.Texture = 0;
-   instruction.Padding  = 0;
+   instruction.Memory = 0;
+   instruction.Padding = 0;
 
    return instruction;
 }
@@ -762,6 +767,34 @@ tgsi_build_instruction_texture(
    return instruction_texture;
 }
 
+static struct tgsi_instruction_memory
+tgsi_default_instruction_memory( void )
+{
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = 0;
+   instruction_memory.Padding = 0;
+
+   return instruction_memory;
+}
+
+static struct tgsi_instruction_memory
+tgsi_build_instruction_memory(
+   unsigned qualifier,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_memory instruction_memory;
+
+   instruction_memory.Qualifier = qualifier;
+   instruction_memory.Padding = 0;
+   instruction->Memory = 1;
+
+   instruction_grow( instruction, header );
+
+   return instruction_memory;
+}
 
 static struct tgsi_texture_offset
 tgsi_default_texture_offset( void )
@@ -1008,6 +1041,7 @@ tgsi_default_full_instruction( void )
    full_instruction.Predicate = tgsi_default_instruction_predicate();
    full_instruction.Label = tgsi_default_instruction_label();
    full_instruction.Texture = tgsi_default_instruction_texture();
+   full_instruction.Memory = tgsi_default_instruction_memory();
    for( i = 0;  i < TGSI_FULL_MAX_TEX_OFFSETS; i++ ) {
       full_instruction.TexOffsets[i] = tgsi_default_texture_offset();
    }
@@ -1119,6 +1153,24 @@ tgsi_build_full_instruction(
          prev_token = (struct tgsi_token *) texture_offset;
       }
    }
+
+   if (full_inst->Instruction.Memory) {
+      struct tgsi_instruction_memory *instruction_memory;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_memory =
+         (struct  tgsi_instruction_memory *) &tokens[size];
+      size++;
+
+      *instruction_memory = tgsi_build_instruction_memory(
+         full_inst->Memory.Qualifier,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_memory;
+   }
+
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
       const struct tgsi_full_dst_register *reg = &full_inst->Dst[i];
       struct tgsi_dst_register *dst_register;
index e29ffb3..2ad29b9 100644 (file)
@@ -348,15 +348,22 @@ iter_declaration(
       }
    }
 
-   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
+   if (decl->Declaration.File == TGSI_FILE_IMAGE) {
       TXT(", ");
-      ENM(decl->Resource.Resource, tgsi_texture_names);
-      if (decl->Resource.Writable)
+      ENM(decl->Image.Resource, tgsi_texture_names);
+      TXT(", ");
+      UID(decl->Image.Format);
+      if (decl->Image.Writable)
          TXT(", WR");
-      if (decl->Resource.Raw)
+      if (decl->Image.Raw)
          TXT(", RAW");
    }
 
+   if (decl->Declaration.File == TGSI_FILE_BUFFER) {
+      if (decl->Declaration.Atomic)
+         TXT(", ATOMIC");
+   }
+
    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
       TXT(", ");
       ENM(decl->SamplerView.Resource, tgsi_texture_names);
@@ -617,6 +624,16 @@ iter_instruction(
       }
    }
 
+   if (inst->Instruction.Memory) {
+      uint32_t qualifier = inst->Memory.Qualifier;
+      while (qualifier) {
+         int bit = ffs(qualifier) - 1;
+         qualifier &= ~(1U << bit);
+         TXT(", ");
+         ENM(bit, tgsi_memory_names);
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_IF:
    case TGSI_OPCODE_UIF:
index f86adce..26fec8e 100644 (file)
@@ -473,6 +473,7 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
       return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
index 3b40c3d..b270dd7 100644 (file)
 
 static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
 {
-   { 1, 1, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
-   { 1, 1, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
-   { 1, 1, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
-   { 1, 1, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
-   { 1, 1, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
-   { 1, 2, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
-   { 1, 2, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
-   { 1, 2, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
-   { 1, 3, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
-   { 1, 3, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
-   { 1, 3, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
-   { 1, 1, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
-   { 1, 3, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
-   { 1, 3, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
-   { 1, 1, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
-   { 1, 1, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
-   { 1, 2, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
-   { 1, 2, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
-   { 1, 1, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
-   { 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2H", TGSI_OPCODE_PK2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK2US", TGSI_OPCODE_PK2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4B", TGSI_OPCODE_PK4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "PK4UB", TGSI_OPCODE_PK4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
-   { 1, 1, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
-   { 1, 2, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
-   { 1, 2, 1, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
-   { 1, 4, 1, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2H", TGSI_OPCODE_UP2H },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP2US", TGSI_OPCODE_UP2US },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4B", TGSI_OPCODE_UP4B },
-   { 1, 1, 0, 0, 0, 0, COMP, "UP4UB", TGSI_OPCODE_UP4UB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
-   { 0, 1, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
-   { 1, 1, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
-   { 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
-   { 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
-   { 1, 1, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
-   { 1, 3, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
-   { 1, 1, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
-   { 1, 2, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
-   { 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
-   { 0, 1, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
-   { 0, 1, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
-   { 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
-   { 0, 1, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
-   { 1, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
-   { 1, 1, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
-   { 1, 1, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
-   { 1, 2, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
-   { 1, 2, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
-   { 1, 2, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
-   { 1, 2, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
-   { 1, 3, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
-   { 1, 2, 1, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
-   { 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
-   { 0, 1, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
-   { 0, 1, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
-   { 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
-   { 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
-   { 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
-   { 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
-   { 1, 1, 1, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
-   { 0, 0, 0, 0, 0, 0, NONE, "", 105 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
-   { 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
-   { 0, 1, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
-   { 0, 1, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
-   { 0, 1, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
-   { 0, 1, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
-   { 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
-   { 1, 3, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
-   { 1, 2, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
-   { 1, 1, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
-   { 1, 2, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
-   { 1, 3, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
-   { 1, 2, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
-   { 0, 1, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
-   { 0, 1, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
-   { 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
-   { 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARL", TGSI_OPCODE_ARL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "MOV", TGSI_OPCODE_MOV },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LIT", TGSI_OPCODE_LIT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RCP", TGSI_OPCODE_RCP },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "RSQ", TGSI_OPCODE_RSQ },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "EXP", TGSI_OPCODE_EXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "LOG", TGSI_OPCODE_LOG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
+   { 1, 2, 0, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
+   { 1, 3, 0, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 22 },      /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 23 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 32 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ABS", TGSI_OPCODE_ABS },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 34 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 44 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 46 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SGT", TGSI_OPCODE_SGT },
+   { 1, 1, 0, 0, 0, 0, 0, REPL, "SIN", TGSI_OPCODE_SIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SLE", TGSI_OPCODE_SLE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SNE", TGSI_OPCODE_SNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 51 },      /* removed */
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TEX", TGSI_OPCODE_TEX },
+   { 1, 4, 1, 0, 0, 0, 0, OTHR, "TXD", TGSI_OPCODE_TXD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXP", TGSI_OPCODE_TXP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2H", TGSI_OPCODE_UP2H },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP2US", TGSI_OPCODE_UP2US },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4B", TGSI_OPCODE_UP4B },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "UP4UB", TGSI_OPCODE_UP4UB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 59 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 60 },      /* removed */
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ARR", TGSI_OPCODE_ARR },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 62 },      /* removed */
+   { 0, 0, 0, 0, 1, 0, 0, NONE, "CAL", TGSI_OPCODE_CAL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "RET", TGSI_OPCODE_RET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "SSG", TGSI_OPCODE_SSG },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "CMP", TGSI_OPCODE_CMP },
+   { 1, 1, 0, 0, 0, 0, 0, CHAN, "SCS", TGSI_OPCODE_SCS },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXB", TGSI_OPCODE_TXB },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 69 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DIV", TGSI_OPCODE_DIV },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "DP2", TGSI_OPCODE_DP2 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXL", TGSI_OPCODE_TXL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "BRK", TGSI_OPCODE_BRK },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "IF", TGSI_OPCODE_IF },
+   { 0, 1, 0, 0, 1, 0, 1, NONE, "UIF", TGSI_OPCODE_UIF },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 76 },      /* removed */
+   { 0, 0, 0, 0, 1, 1, 1, NONE, "ELSE", TGSI_OPCODE_ELSE },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDIF", TGSI_OPCODE_ENDIF },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX_FINE", TGSI_OPCODE_DDX_FINE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY_FINE", TGSI_OPCODE_DDY_FINE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "PUSHA", TGSI_OPCODE_PUSHA },
+   { 1, 0, 0, 0, 0, 0, 0, NONE, "POPA", TGSI_OPCODE_POPA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "CEIL", TGSI_OPCODE_CEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2F", TGSI_OPCODE_I2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "NOT", TGSI_OPCODE_NOT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "TRUNC", TGSI_OPCODE_TRUNC },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "SHL", TGSI_OPCODE_SHL },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 88 },      /* removed */
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "AND", TGSI_OPCODE_AND },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "OR", TGSI_OPCODE_OR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "MOD", TGSI_OPCODE_MOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "XOR", TGSI_OPCODE_XOR },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "SAD", TGSI_OPCODE_SAD },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXF", TGSI_OPCODE_TXF },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXQ", TGSI_OPCODE_TXQ },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "CONT", TGSI_OPCODE_CONT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "EMIT", TGSI_OPCODE_EMIT },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "ENDPRIM", TGSI_OPCODE_ENDPRIM },
+   { 0, 0, 0, 0, 1, 0, 1, NONE, "BGNLOOP", TGSI_OPCODE_BGNLOOP },
+   { 0, 0, 0, 0, 0, 0, 1, NONE, "BGNSUB", TGSI_OPCODE_BGNSUB },
+   { 0, 0, 0, 0, 1, 1, 0, NONE, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
+   { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ },
+   { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQS", TGSI_OPCODE_TXQS },
+   { 1, 1, 0, 0, 0, 0, 0, NONE, "RESQ", TGSI_OPCODE_RESQ },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSGE", TGSI_OPCODE_FSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSLT", TGSI_OPCODE_FSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "FSNE", TGSI_OPCODE_FSNE },
+   { 0, 1, 0, 0, 0, 0, 1, NONE, "", 112 },      /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CALLNZ", TGSI_OPCODE_CALLNZ },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "", 114 },     /* removed */
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "BREAKC", TGSI_OPCODE_BREAKC },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "KILL_IF", TGSI_OPCODE_KILL_IF },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "END", TGSI_OPCODE_END },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DFMA", TGSI_OPCODE_DFMA },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, 0, NONE, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, 0, NONE, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
-   { 1, 3, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
-   { 1, 5, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
-   { 1, 4, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
-   { 1, 3, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
-   { 1, 2, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
-   { 1, 1, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
-   { 1, 3, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
-   { 1, 1, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
-   { 1, 2, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
-   { 1, 2, 0, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
-   { 1, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
-   { 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE",      TGSI_OPCODE_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I",    TGSI_OPCODE_SAMPLE_I },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "SAMPLE_I_MS", TGSI_OPCODE_SAMPLE_I_MS },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_B",    TGSI_OPCODE_SAMPLE_B },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C",    TGSI_OPCODE_SAMPLE_C },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_C_LZ", TGSI_OPCODE_SAMPLE_C_LZ },
+   { 1, 5, 0, 0, 0, 0, 0, OTHR, "SAMPLE_D",    TGSI_OPCODE_SAMPLE_D },
+   { 1, 4, 0, 0, 0, 0, 0, OTHR, "SAMPLE_L",    TGSI_OPCODE_SAMPLE_L },
+   { 1, 3, 0, 0, 0, 0, 0, OTHR, "GATHER4",     TGSI_OPCODE_GATHER4 },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SVIEWINFO",   TGSI_OPCODE_SVIEWINFO },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_POS",  TGSI_OPCODE_SAMPLE_POS },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "SAMPLE_INFO", TGSI_OPCODE_SAMPLE_INFO },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UARL", TGSI_OPCODE_UARL },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UCMP", TGSI_OPCODE_UCMP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IABS", TGSI_OPCODE_IABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "ISSG", TGSI_OPCODE_ISSG },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "LOAD", TGSI_OPCODE_LOAD },
+   { 1, 2, 0, 1, 0, 0, 0, OTHR, "STORE", TGSI_OPCODE_STORE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "MFENCE", TGSI_OPCODE_MFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "LFENCE", TGSI_OPCODE_LFENCE },
+   { 1, 0, 0, 0, 0, 0, 0, OTHR, "SFENCE", TGSI_OPCODE_SFENCE },
+   { 0, 0, 0, 0, 0, 0, 0, OTHR, "BARRIER", TGSI_OPCODE_BARRIER },
 
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
-   { 1, 4, 0, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
-   { 1, 3, 0, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
-   { 1, 2, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
-   { 1, 2, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
-   { 1, 3, 1, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
-   { 1, 2, 1, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
-   { 1, 3, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
-   { 1, 3, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
-   { 1, 4, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
-   { 1, 1, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
-   { 1, 1, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
-   { 1, 1, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
-   { 1, 1, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
-   { 1, 1, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
-   { 1, 2, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
-   { 1, 1, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
-   { 1, 1, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
-   { 1, 1, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
-   { 1, 2, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
-   { 1, 2, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
-   { 1, 2, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
-   { 1, 1, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
-   { 1, 3, 0, 0 ,0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
-   { 1, 2, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
-   { 2, 1, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
-   { 1, 1, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
-   { 1, 1, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
-   { 1, 1, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
-   { 1, 1, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
-   { 1, 1, 0, 0 ,0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
-   { 1, 1, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
-   { 1, 1, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
-   { 1, 1, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
-   { 1, 1, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
-   { 1, 1, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUADD", TGSI_OPCODE_ATOMUADD },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXCHG", TGSI_OPCODE_ATOMXCHG },
+   { 1, 4, 0, 1, 0, 0, 0, OTHR, "ATOMCAS", TGSI_OPCODE_ATOMCAS },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMAND", TGSI_OPCODE_ATOMAND },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMOR", TGSI_OPCODE_ATOMOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMXOR", TGSI_OPCODE_ATOMXOR },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMIN", TGSI_OPCODE_ATOMUMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMUMAX", TGSI_OPCODE_ATOMUMAX },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMIN", TGSI_OPCODE_ATOMIMIN },
+   { 1, 3, 0, 1, 0, 0, 0, OTHR, "ATOMIMAX", TGSI_OPCODE_ATOMIMAX },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TEX2", TGSI_OPCODE_TEX2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXB2", TGSI_OPCODE_TXB2 },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TXL2", TGSI_OPCODE_TXL2 },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "IMUL_HI", TGSI_OPCODE_IMUL_HI },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "UMUL_HI", TGSI_OPCODE_UMUL_HI },
+   { 1, 3, 1, 0, 0, 0, 0, OTHR, "TG4", TGSI_OPCODE_TG4 },
+   { 1, 2, 1, 0, 0, 0, 0, OTHR, "LODQ", TGSI_OPCODE_LODQ },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "IBFE", TGSI_OPCODE_IBFE },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "UBFE", TGSI_OPCODE_UBFE },
+   { 1, 4, 0, 0, 0, 0, 0, COMP, "BFI", TGSI_OPCODE_BFI },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "BREV", TGSI_OPCODE_BREV },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "POPC", TGSI_OPCODE_POPC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "LSB", TGSI_OPCODE_LSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "IMSB", TGSI_OPCODE_IMSB },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "UMSB", TGSI_OPCODE_UMSB },
+   { 1, 1, 0, 0, 0, 0, 0, OTHR, "INTERP_CENTROID", TGSI_OPCODE_INTERP_CENTROID },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_SAMPLE", TGSI_OPCODE_INTERP_SAMPLE },
+   { 1, 2, 0, 0, 0, 0, 0, OTHR, "INTERP_OFFSET", TGSI_OPCODE_INTERP_OFFSET },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "F2D", TGSI_OPCODE_F2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2F", TGSI_OPCODE_D2F },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DABS", TGSI_OPCODE_DABS },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DNEG", TGSI_OPCODE_DNEG },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DADD", TGSI_OPCODE_DADD },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMUL", TGSI_OPCODE_DMUL },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMAX", TGSI_OPCODE_DMAX },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DMIN", TGSI_OPCODE_DMIN },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSLT", TGSI_OPCODE_DSLT },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSGE", TGSI_OPCODE_DSGE },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSEQ", TGSI_OPCODE_DSEQ },
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DSNE", TGSI_OPCODE_DSNE },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRCP", TGSI_OPCODE_DRCP },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSQRT", TGSI_OPCODE_DSQRT },
+   { 1, 3, 0, 0, 0, 0, 0, COMP, "DMAD", TGSI_OPCODE_DMAD },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFRAC", TGSI_OPCODE_DFRAC},
+   { 1, 2, 0, 0, 0, 0, 0, COMP, "DLDEXP", TGSI_OPCODE_DLDEXP},
+   { 2, 1, 0, 0, 0, 0, 0, COMP, "DFRACEXP", TGSI_OPCODE_DFRACEXP},
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2I", TGSI_OPCODE_D2I },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "I2D", TGSI_OPCODE_I2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "D2U", TGSI_OPCODE_D2U },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "U2D", TGSI_OPCODE_U2D },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DRSQ", TGSI_OPCODE_DRSQ },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DTRUNC", TGSI_OPCODE_DTRUNC },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DCEIL", TGSI_OPCODE_DCEIL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
 };
 
 const struct tgsi_opcode_info *
index aa7edd1..46f03cd 100644 (file)
@@ -74,6 +74,7 @@ struct tgsi_opcode_info
    unsigned num_dst:3;
    unsigned num_src:3;
    unsigned is_tex:1;
+   unsigned is_store:1;
    unsigned is_branch:1;
    int pre_dedent:2;
    int post_indent:2;
index 0729b5d..ae95ebd 100644 (file)
@@ -121,8 +121,8 @@ tgsi_parse_token(
          next_token( ctx, &decl->Semantic );
       }
 
-      if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
-         next_token(ctx, &decl->Resource);
+      if (decl->Declaration.File == TGSI_FILE_IMAGE) {
+         next_token(ctx, &decl->Image);
       }
 
       if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -195,6 +195,10 @@ tgsi_parse_token(
          }
       }
 
+      if (inst->Instruction.Memory) {
+         next_token(ctx, &inst->Memory);
+      }
+
       assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
 
       for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
index 35e1c7c..4689fb7 100644 (file)
@@ -64,7 +64,7 @@ struct tgsi_full_declaration
    struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_interp Interp;
    struct tgsi_declaration_semantic Semantic;
-   struct tgsi_declaration_resource Resource;
+   struct tgsi_declaration_image Image;
    struct tgsi_declaration_sampler_view SamplerView;
    struct tgsi_declaration_array Array;
 };
@@ -91,6 +91,7 @@ struct tgsi_full_instruction
    struct tgsi_instruction_predicate   Predicate;
    struct tgsi_instruction_label       Label;
    struct tgsi_instruction_texture     Texture;
+   struct tgsi_instruction_memory      Memory;
    struct tgsi_full_dst_register       Dst[TGSI_FULL_MAX_DST_REGISTERS];
    struct tgsi_full_src_register       Src[TGSI_FULL_MAX_SRC_REGISTERS];
    struct tgsi_texture_offset          TexOffsets[TGSI_FULL_MAX_TEX_OFFSETS];
index e04f407..7a02e27 100644 (file)
@@ -187,13 +187,28 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   }
 
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      info->reads_position &&
-                      src->Register.Index == 0 &&
-                      (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
-                       src->Register.SwizzleW == TGSI_SWIZZLE_Z)) {
-                     info->reads_z = TRUE;
+                      !src->Register.Indirect) {
+                     unsigned name =
+                        info->input_semantic_name[src->Register.Index];
+                     unsigned index =
+                        info->input_semantic_index[src->Register.Index];
+
+                     if (name == TGSI_SEMANTIC_POSITION &&
+                         (src->Register.SwizzleX == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleY == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleZ == TGSI_SWIZZLE_Z ||
+                          src->Register.SwizzleW == TGSI_SWIZZLE_Z))
+                        info->reads_z = TRUE;
+
+                     if (name == TGSI_SEMANTIC_COLOR) {
+                        unsigned mask =
+                              (1 << src->Register.SwizzleX) |
+                              (1 << src->Register.SwizzleY) |
+                              (1 << src->Register.SwizzleZ) |
+                              (1 << src->Register.SwizzleW);
+
+                        info->colors_read |= mask << (index * 4);
+                     }
                   }
                }
 
@@ -358,7 +373,10 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      info->uses_primid = TRUE;
                   } else if (semName == TGSI_SEMANTIC_INVOCATIONID) {
                      info->uses_invocationid = TRUE;
-                  }
+                  } else if (semName == TGSI_SEMANTIC_POSITION)
+                     info->reads_position = TRUE;
+                  else if (semName == TGSI_SEMANTIC_FACE)
+                     info->uses_frontface = TRUE;
                }
                else if (file == TGSI_FILE_OUTPUT) {
                   info->output_semantic_name[reg] = (ubyte) semName;
@@ -392,6 +410,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                      }
                      else if (semName == TGSI_SEMANTIC_STENCIL) {
                         info->writes_stencil = TRUE;
+                     } else if (semName == TGSI_SEMANTIC_SAMPLEMASK) {
+                        info->writes_samplemask = TRUE;
                      }
                   }
 
index 7e9a559..b0b423a 100644 (file)
@@ -77,11 +77,13 @@ struct tgsi_shader_info
 
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
+   ubyte colors_read; /**< which color components are read by the FS */
    ubyte colors_written;
    boolean reads_position; /**< does fragment shader read position? */
    boolean reads_z; /**< does fragment shader read depth? */
    boolean writes_z;  /**< does fragment shader write Z value? */
    boolean writes_stencil; /**< does fragment shader write stencil value? */
+   boolean writes_samplemask; /**< does fragment shader write sample mask? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KILL or KILL_IF instruction used? */
    boolean uses_persp_center;
index fc29a23..f2d70d4 100644 (file)
@@ -54,8 +54,9 @@ static const char *tgsi_file_names[] =
    "IMM",
    "PRED",
    "SV",
-   "RES",
-   "SVIEW"
+   "IMAGE",
+   "SVIEW",
+   "BUFFER",
 };
 
 const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
@@ -96,6 +97,8 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "TESSINNER",
    "VERTICESIN",
    "HELPER_INVOCATION",
+   "BASEINSTANCE",
+   "DRAWID",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
@@ -205,6 +208,13 @@ const char *tgsi_immediate_type_names[4] =
    "FLT64"
 };
 
+const char *tgsi_memory_names[3] =
+{
+   "COHERENT",
+   "RESTRICT",
+   "VOLATILE",
+};
+
 
 static inline void
 tgsi_strings_check(void)
index 71e7437..031d322 100644 (file)
@@ -60,6 +60,8 @@ extern const char *tgsi_fs_coord_pixel_center_names[2];
 
 extern const char *tgsi_immediate_type_names[4];
 
+extern const char *tgsi_memory_names[3];
+
 
 const char *
 tgsi_file_name(unsigned file);
index 4a82c9b..97b1869 100644 (file)
@@ -1039,6 +1039,12 @@ parse_instruction(
       inst.Texture.Texture = TGSI_TEXTURE_UNKNOWN;
    }
 
+   if ((i >= TGSI_OPCODE_LOAD && i <= TGSI_OPCODE_ATOMIMAX) ||
+       i == TGSI_OPCODE_RESQ) {
+      inst.Instruction.Memory = 1;
+      inst.Memory.Qualifier = 0;
+   }
+
    /* Parse instruction operands.
     */
    for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) {
@@ -1091,6 +1097,27 @@ parse_instruction(
    inst.Texture.NumOffsets = i;
 
    cur = ctx->cur;
+   eat_opt_white(&cur);
+   for (i = 0; inst.Instruction.Memory && *cur == ','; i++) {
+      uint j;
+      cur++;
+      eat_opt_white(&cur);
+      ctx->cur = cur;
+      for (j = 0; j < 3; j++) {
+         if (str_match_nocase_whole(&ctx->cur, tgsi_memory_names[j])) {
+            inst.Memory.Qualifier |= 1U << j;
+            break;
+         }
+      }
+      if (j == 3) {
+         report_error(ctx, "Expected memory qualifier");
+         return FALSE;
+      }
+      cur = ctx->cur;
+      eat_opt_white(&cur);
+   }
+
+   cur = ctx->cur;
    eat_opt_white( &cur );
    if (info->is_branch && *cur == ':') {
       uint target;
@@ -1251,10 +1278,10 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 
       cur++;
       eat_opt_white( &cur );
-      if (file == TGSI_FILE_RESOURCE) {
+      if (file == TGSI_FILE_IMAGE) {
          for (i = 0; i < TGSI_TEXTURE_COUNT; i++) {
             if (str_match_nocase_whole(&cur, tgsi_texture_names[i])) {
-               decl.Resource.Resource = i;
+               decl.Image.Resource = i;
                break;
             }
          }
@@ -1263,16 +1290,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             return FALSE;
          }
 
+         /* XXX format */
+
          cur2 = cur;
          eat_opt_white(&cur2);
          while (*cur2 == ',') {
             cur2++;
             eat_opt_white(&cur2);
             if (str_match_nocase_whole(&cur2, "RAW")) {
-               decl.Resource.Raw = 1;
+               decl.Image.Raw = 1;
 
             } else if (str_match_nocase_whole(&cur2, "WR")) {
-               decl.Resource.Writable = 1;
+               decl.Image.Writable = 1;
 
             } else {
                break;
@@ -1348,6 +1377,11 @@ static boolean parse_declaration( struct translate_ctx *ctx )
                decl.SamplerView.ReturnTypeX;
          }
          ctx->cur = cur;
+      } else if (file == TGSI_FILE_BUFFER) {
+         if (str_match_nocase_whole(&cur, "ATOMIC")) {
+            decl.Declaration.Atomic = 1;
+            ctx->cur = cur;
+         }
       } else {
          if (str_match_nocase_whole(&cur, "LOCAL")) {
             decl.Declaration.Local = 1;
index 4aaf8df..d681150 100644 (file)
@@ -50,6 +50,7 @@ union tgsi_any_token {
    struct tgsi_declaration_range decl_range;
    struct tgsi_declaration_dimension decl_dim;
    struct tgsi_declaration_interp decl_interp;
+   struct tgsi_declaration_image decl_image;
    struct tgsi_declaration_semantic decl_semantic;
    struct tgsi_declaration_sampler_view decl_sampler_view;
    struct tgsi_declaration_array array;
@@ -59,6 +60,7 @@ union tgsi_any_token {
    struct tgsi_instruction_predicate insn_predicate;
    struct tgsi_instruction_label insn_label;
    struct tgsi_instruction_texture insn_texture;
+   struct tgsi_instruction_memory insn_memory;
    struct tgsi_texture_offset insn_texture_offset;
    struct tgsi_src_register src;
    struct tgsi_ind_register ind;
@@ -115,7 +117,6 @@ struct ureg_program
    unsigned vs_inputs[PIPE_MAX_ATTRIBS/32];
 
    struct {
-      unsigned index;
       unsigned semantic_name;
       unsigned semantic_index;
    } system_value[UREG_MAX_SYSTEM_VALUE];
@@ -155,6 +156,21 @@ struct ureg_program
    } sampler_view[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    unsigned nr_sampler_views;
 
+   struct {
+      unsigned index;
+      unsigned target;
+      unsigned format;
+      boolean wr;
+      boolean raw;
+   } image[PIPE_MAX_SHADER_IMAGES];
+   unsigned nr_images;
+
+   struct {
+      unsigned index;
+      bool atomic;
+   } buffer[PIPE_MAX_SHADER_BUFFERS];
+   unsigned nr_buffers;
+
    struct util_bitmask *free_temps;
    struct util_bitmask *local_temps;
    struct util_bitmask *decl_temps;
@@ -320,20 +336,29 @@ ureg_DECL_input(struct ureg_program *ureg,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *ureg,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index)
 {
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_system_values; i++) {
+      if (ureg->system_value[i].semantic_name == semantic_name &&
+          ureg->system_value[i].semantic_index == semantic_index) {
+         goto out;
+      }
+   }
+
    if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {
-      ureg->system_value[ureg->nr_system_values].index = index;
       ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
       ureg->system_value[ureg->nr_system_values].semantic_index = semantic_index;
+      i = ureg->nr_system_values;
       ureg->nr_system_values++;
    } else {
       set_bad(ureg);
    }
 
-   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, index);
+out:
+   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, i);
 }
 
 
@@ -648,6 +673,60 @@ ureg_DECL_sampler_view(struct ureg_program *ureg,
    return reg;
 }
 
+/* Allocate a new image.
+ */
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_IMAGE, index);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_images; i++)
+      if (ureg->image[i].index == index)
+         return reg;
+
+   if (i < PIPE_MAX_SHADER_IMAGES) {
+      ureg->image[i].index = index;
+      ureg->image[i].target = target;
+      ureg->image[i].wr = wr;
+      ureg->image[i].raw = raw;
+      ureg->image[i].format = format;
+      ureg->nr_images++;
+      return reg;
+   }
+
+   assert(0);
+   return reg;
+}
+
+/* Allocate a new buffer.
+ */
+struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
+                                 bool atomic)
+{
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_BUFFER, nr);
+   unsigned i;
+
+   for (i = 0; i < ureg->nr_buffers; i++)
+      if (ureg->buffer[i].index == nr)
+         return reg;
+
+   if (i < PIPE_MAX_SHADER_BUFFERS) {
+      ureg->buffer[i].index = nr;
+      ureg->buffer[i].atomic = atomic;
+      ureg->nr_buffers++;
+      return reg;
+   }
+
+   assert(0);
+   return reg;
+}
+
 static int
 match_or_expand_immediate64( const unsigned *v,
                              int type,
@@ -1148,6 +1227,21 @@ ureg_emit_texture_offset(struct ureg_program *ureg,
    
 }
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned extended_token,
+                 unsigned qualifier)
+{
+   union tgsi_any_token *out, *insn;
+
+   out = get_tokens( ureg, DOMAIN_INSN, 1 );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
+
+   insn->insn.Memory = 1;
+
+   out[0].value = 0;
+   out[0].insn_memory.Qualifier = qualifier;
+}
 
 void
 ureg_fixup_insn_size(struct ureg_program *ureg,
@@ -1300,6 +1394,42 @@ ureg_label_insn(struct ureg_program *ureg,
 }
 
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier)
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         FALSE,
+                         FALSE,
+                         FALSE,
+                         TGSI_SWIZZLE_X,
+                         TGSI_SWIZZLE_Y,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         nr_dst,
+                         nr_src);
+
+   ureg_emit_memory(ureg, insn.extended_token, qualifier);
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst(ureg, dst[i]);
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src(ureg, src[i]);
+
+   ureg_fixup_insn_size(ureg, insn.insn_token);
+}
+
+
 static void
 emit_decl_semantic(struct ureg_program *ureg,
                    unsigned file,
@@ -1478,6 +1608,52 @@ emit_decl_sampler_view(struct ureg_program *ureg,
 }
 
 static void
+emit_decl_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 3;
+   out[0].decl.File = TGSI_FILE_IMAGE;
+   out[0].decl.UsageMask = 0xf;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+
+   out[2].value = 0;
+   out[2].decl_image.Resource = target;
+   out[2].decl_image.Writable = wr;
+   out[2].decl_image.Raw      = raw;
+   out[2].decl_image.Format   = format;
+}
+
+static void
+emit_decl_buffer(struct ureg_program *ureg,
+                 unsigned index,
+                 bool atomic)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
+
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 2;
+   out[0].decl.File = TGSI_FILE_BUFFER;
+   out[0].decl.UsageMask = 0xf;
+   out[0].decl.Atomic = atomic;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;
+   out[1].decl_range.Last = index;
+}
+
+static void
 emit_immediate( struct ureg_program *ureg,
                 const unsigned *v,
                 unsigned type )
@@ -1587,8 +1763,8 @@ static void emit_decls( struct ureg_program *ureg )
    for (i = 0; i < ureg->nr_system_values; i++) {
       emit_decl_semantic(ureg,
                          TGSI_FILE_SYSTEM_VALUE,
-                         ureg->system_value[i].index,
-                         ureg->system_value[i].index,
+                         i,
+                         i,
                          ureg->system_value[i].semantic_name,
                          ureg->system_value[i].semantic_index,
                          TGSI_WRITEMASK_XYZW, 0);
@@ -1636,6 +1812,19 @@ static void emit_decls( struct ureg_program *ureg )
                              ureg->sampler_view[i].return_type_w);
    }
 
+   for (i = 0; i < ureg->nr_images; i++) {
+      emit_decl_image(ureg,
+                      ureg->image[i].index,
+                      ureg->image[i].target,
+                      ureg->image[i].format,
+                      ureg->image[i].wr,
+                      ureg->image[i].raw);
+   }
+
+   for (i = 0; i < ureg->nr_buffers; i++) {
+      emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
+   }
+
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
          emit_decl_range(ureg,
index 0aae550..86e58a9 100644 (file)
@@ -221,7 +221,6 @@ ureg_DECL_input(struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *,
-                       unsigned index,
                        unsigned semantic_name,
                        unsigned semantic_index);
 
@@ -327,6 +326,16 @@ ureg_DECL_sampler_view(struct ureg_program *,
                        unsigned return_type_z,
                        unsigned return_type_w );
 
+struct ureg_src
+ureg_DECL_image(struct ureg_program *ureg,
+                unsigned index,
+                unsigned target,
+                unsigned format,
+                boolean wr,
+                boolean raw);
+
+struct ureg_src
+ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
 
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
@@ -522,6 +531,14 @@ ureg_label_insn(struct ureg_program *ureg,
                 unsigned nr_src,
                 unsigned *label);
 
+void
+ureg_memory_insn(struct ureg_program *ureg,
+                 unsigned opcode,
+                 const struct ureg_dst *dst,
+                 unsigned nr_dst,
+                 const struct ureg_src *src,
+                 unsigned nr_src,
+                 unsigned qualifier);
 
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
@@ -559,6 +576,11 @@ void
 ureg_emit_texture_offset(struct ureg_program *ureg,
                          const struct tgsi_texture_offset *offset);
 
+void
+ureg_emit_memory(struct ureg_program *ureg,
+                 unsigned insn_token,
+                 unsigned qualifier);
+
 void 
 ureg_emit_dst( struct ureg_program *ureg,
                struct ureg_dst dst );
index 653e650..5fff3f0 100644 (file)
@@ -29,6 +29,7 @@
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_util.h"
+#include "tgsi_exec.h"
 
 union pointer_hack
 {
@@ -53,17 +54,17 @@ tgsi_util_get_src_register_swizzle(
    const struct tgsi_src_register *reg,
    unsigned component )
 {
-   switch( component ) {
-   case 0:
+   switch (component) {
+   case TGSI_CHAN_X:
       return reg->SwizzleX;
-   case 1:
+   case TGSI_CHAN_Y:
       return reg->SwizzleY;
-   case 2:
+   case TGSI_CHAN_Z:
       return reg->SwizzleZ;
-   case 3:
+   case TGSI_CHAN_W:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      assert(0);
    }
    return 0;
 }
index 05b4567..43fbd8e 100644 (file)
@@ -320,7 +320,8 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
-   ctx->upload = u_upload_create(pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   ctx->upload = u_upload_create(pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                 PIPE_USAGE_STREAM);
 
    return &ctx->base;
 }
@@ -1191,7 +1192,7 @@ static void blitter_draw(struct blitter_context_priv *ctx,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), ctx->vertices,
+   u_upload_data(ctx->upload, 0, sizeof(ctx->vertices), 4, ctx->vertices,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       return;
@@ -2111,7 +2112,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
       return;
    }
 
-   u_upload_data(ctx->upload, 0, num_channels*4, clear_value,
+   u_upload_data(ctx->upload, 0, num_channels*4, 4, clear_value,
                  &vb.buffer_offset, &vb.buffer);
    if (!vb.buffer)
       goto out;
index cb162d8..2b60559 100644 (file)
@@ -727,6 +727,65 @@ error1:
    ;
 }
 
+void
+debug_dump_ubyte_rgba_bmp(const char *filename,
+                          unsigned width, unsigned height,
+                          const ubyte *rgba, unsigned stride)
+{
+   FILE *stream;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   unsigned x, y;
+
+   assert(rgba);
+   if(!rgba)
+      goto error1;
+
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + height*width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+
+   bmih.biSize = 40;
+   bmih.biWidth = width;
+   bmih.biHeight = height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = height*width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+
+   stream = fopen(filename, "wb");
+   assert(stream);
+   if(!stream)
+      goto error1;
+
+   fwrite(&bmfh, 14, 1, stream);
+   fwrite(&bmih, 40, 1, stream);
+
+   y = height;
+   while(y--) {
+      const ubyte *ptr = rgba + (stride * y * 4);
+      for(x = 0; x < width; ++x)
+      {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = ptr[x*4 + 0];
+         pixel.rgbGreen = ptr[x*4 + 1];
+         pixel.rgbBlue  = ptr[x*4 + 2];
+         pixel.rgbAlpha = ptr[x*4 + 3];
+         fwrite(&pixel, 1, 4, stream);
+      }
+   }
+
+   fclose(stream);
+error1:
+   ;
+}
+
 
 /**
  * Print PIPE_TRANSFER_x flags with a message.
index 5307072..671bd37 100644 (file)
@@ -404,6 +404,19 @@ debug_get_flags_option(const char *name,
                        const struct debug_named_value *flags,
                        uint64_t dfault);
 
+#define DEBUG_GET_ONCE_OPTION(suffix, name, dfault) \
+static const char * \
+debug_get_option_ ## suffix (void) \
+{ \
+   static boolean first = TRUE; \
+   static const char * value; \
+   if (first) { \
+      first = FALSE; \
+      value = debug_get_option(name, dfault); \
+   } \
+   return value; \
+}
+
 #define DEBUG_GET_ONCE_BOOL_OPTION(sufix, name, dfault) \
 static boolean \
 debug_get_option_ ## sufix (void) \
@@ -477,12 +490,16 @@ void debug_dump_transfer_bmp(struct pipe_context *pipe,
 void debug_dump_float_rgba_bmp(const char *filename,
                                unsigned width, unsigned height,
                                float *rgba, unsigned stride);
+void debug_dump_ubyte_rgba_bmp(const char *filename,
+                               unsigned width, unsigned height,
+                               const ubyte *rgba, unsigned stride);
 #else
 #define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
 #define debug_dump_surface(pipe, prefix, surface) ((void)0)
 #define debug_dump_surface_bmp(pipe, filename, surface) ((void)0)
 #define debug_dump_transfer_bmp(filename, transfer, ptr) ((void)0)
 #define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
+#define debug_dump_ubyte_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
 #endif
 
 
diff --git a/src/gallium/auxiliary/util/u_init.h b/src/gallium/auxiliary/util/u_init.h
deleted file mode 100644 (file)
index 7bc356a..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2010 Luca Barbieri
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef U_INIT_H
-#define U_INIT_H
-
-/* Use UTIL_INIT(f) to have f called at program initialization.
-   Note that it is only guaranteed to be called if any symbol in the
-   .c file it is in sis referenced by the program.
-
-   UTIL_INIT functions are called in arbitrary order.
-*/
-
-#ifdef __cplusplus
-/* use a C++ global constructor */
-#define UTIL_INIT(f) struct f##__gctor_t {f##__gctor_t() {x();}} f##__gctor;
-#elif defined(_MSC_VER)
-/* add a pointer to the section where MSVC stores global constructor pointers */
-/* see http://blogs.msdn.com/vcblog/archive/2006/10/20/crt-initialization.aspx and
-   http://stackoverflow.com/questions/1113409/attribute-constructor-equivalent-in-vc */
-#pragma section(".CRT$XCU",read)
-#define UTIL_INIT(f) static void __cdecl f##__init(void) {f();}; __declspec(allocate(".CRT$XCU")) void (__cdecl* f##__xcu)(void) = f##__init;
-#elif defined(__GNUC__)
-#define UTIL_INIT(f) static void f##__init(void) __attribute__((constructor)); static void f##__init(void) {f();}
-#else
-#error Unsupported compiler: please find out how to implement global initializers in C on it
-#endif
-
-#endif
-
index 0bb46ff..08dec13 100644 (file)
@@ -177,6 +177,7 @@ struct pstip_transform_context {
    struct tgsi_shader_info info;
    uint tempsUsed;  /**< bitmask */
    int wincoordInput;
+   unsigned wincoordFile;
    int maxInput;
    uint samplersUsed;  /**< bitfield of samplers used */
    int freeSampler;  /** an available sampler for the pstipple */
@@ -206,7 +207,7 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
          pctx->samplersUsed |= 1 << i;
       }
    }
-   else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+   else if (decl->Declaration.File == pctx->wincoordFile) {
       pctx->maxInput = MAX2(pctx->maxInput, (int) decl->Range.Last);
       if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION)
          pctx->wincoordInput = (int) decl->Range.First;
@@ -275,10 +276,22 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
       wincoordInput = pctx->wincoordInput;
 
    if (pctx->wincoordInput < 0) {
+      struct tgsi_full_declaration decl;
+
+      decl = tgsi_default_full_declaration();
       /* declare new position input reg */
-      tgsi_transform_input_decl(ctx, wincoordInput,
-                                TGSI_SEMANTIC_POSITION, 1,
-                                TGSI_INTERPOLATE_LINEAR);
+      decl.Declaration.File = pctx->wincoordFile;
+      decl.Declaration.Semantic = 1;
+      decl.Semantic.Name = TGSI_SEMANTIC_POSITION;
+      decl.Range.First =
+      decl.Range.Last = wincoordInput;
+
+      if (pctx->wincoordFile == TGSI_FILE_INPUT) {
+         decl.Declaration.Interpolate = 1;
+         decl.Interp.Interpolate = TGSI_INTERPOLATE_LINEAR;
+      }
+
+      ctx->emit_declaration(ctx, &decl);
    }
 
    sampIdx = pctx->hasFixedUnit ? pctx->fixedUnit : pctx->freeSampler;
@@ -327,7 +340,7 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
                            TGSI_FILE_TEMPORARY, texTemp,
                            TGSI_WRITEMASK_XYZW,
-                           TGSI_FILE_INPUT, wincoordInput,
+                           pctx->wincoordFile, wincoordInput,
                            TGSI_FILE_IMMEDIATE, pctx->numImmed);
 
    /* TEX texTemp, texTemp, sampler; */
@@ -351,11 +364,15 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
  *                        will be used to sample the stipple texture;
  *                        if NULL, the fixed unit is used
  * \param fixedUnit       fixed texture unit used for the stipple texture
+ * \param wincoordFile    TGSI_FILE_INPUT or TGSI_FILE_SYSTEM_VALUE,
+ *                        depending on which one is supported by the driver
+ *                        for TGSI_SEMANTIC_POSITION in the fragment shader
  */
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixedUnit)
+                                     unsigned fixedUnit,
+                                     unsigned wincoordFile)
 {
    struct pstip_transform_context transform;
    const uint newLen = tgsi_num_tokens(tokens) + NUM_NEW_TOKENS;
@@ -370,6 +387,7 @@ util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
     */
    memset(&transform, 0, sizeof(transform));
    transform.wincoordInput = -1;
+   transform.wincoordFile = wincoordFile;
    transform.maxInput = -1;
    transform.coordOrigin = TGSI_FS_COORD_ORIGIN_UPPER_LEFT;
    transform.hasFixedUnit = !samplerUnitOut;
index 249c58b..ef8396f 100644 (file)
@@ -50,7 +50,8 @@ util_pstipple_create_sampler(struct pipe_context *pipe);
 struct tgsi_token *
 util_pstipple_create_fragment_shader(const struct tgsi_token *tokens,
                                      unsigned *samplerUnitOut,
-                                     unsigned fixed_unit);
+                                     unsigned fixed_unit,
+                                     unsigned wincoordFile);
 
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
new file mode 100644 (file)
index 0000000..1eca6d6
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2015 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Oded Gabbay <oded.gabbay@redhat.com>
+ */
+
+/**
+ * @file
+ * POWER8 intrinsics portability header.
+ *
+ */
+
+#ifndef U_PWR8_H_
+#define U_PWR8_H_
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
+
+typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
+
+typedef VECTOR_ALIGN_16 union m128i {
+   __m128i m128i;
+   vector signed int m128si;
+   vector unsigned int m128ui;
+   ubyte ub[16];
+   ushort us[8];
+   int i[4];
+   uint ui[4];
+} __m128i_union;
+
+static inline __m128i
+vec_set_epi32 (int i3, int i2, int i1, int i0)
+{
+   __m128i_union vdst;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vdst.i[0] = i0;
+   vdst.i[1] = i1;
+   vdst.i[2] = i2;
+   vdst.i[3] = i3;
+#else
+   vdst.i[3] = i0;
+   vdst.i[2] = i1;
+   vdst.i[1] = i2;
+   vdst.i[0] = i3;
+#endif
+
+   return (__m128i) vdst.m128si;
+}
+
+static inline __m128i
+vec_setr_epi32 (int i0, int i1, int i2, int i3)
+{
+  return vec_set_epi32 (i3, i2, i1, i0);
+}
+
+static inline __m128i
+vec_unpacklo_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpacklo_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_add_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_sub_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline __m128i
+vec_mullo_epi32 (__m128i a, __m128i b)
+{
+   __m128i v;
+
+   __asm__(
+           "vmuluwm %0, %1, %2   \n"
+           : "=v" (v)
+           : "v" (a), "v" (b)
+           );
+
+   return v;
+}
+
+static inline void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+   __m128i t0 = vec_unpacklo_epi32(*a, *b);
+   __m128i t1 = vec_unpacklo_epi32(*c, *d);
+   __m128i t2 = vec_unpackhi_epi32(*a, *b);
+   __m128i t3 = vec_unpackhi_epi32(*c, *d);
+
+   *o = vec_unpacklo_epi64(t0, t1);
+   *p = vec_unpackhi_epi64(t0, t1);
+   *q = vec_unpacklo_epi64(t2, t3);
+   *r = vec_unpackhi_epi64(t2, t3);
+}
+
+static inline __m128i
+vec_slli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srai_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_cmpeq_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_loadu_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+#else
+
+   __m128i vmask, tmp1, tmp2;
+
+   vmask = vec_lvsl(0, src);
+
+   tmp1 = (__m128i) vec_ld (0, src);
+   tmp2 = (__m128i) vec_ld (15, src);
+   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);
+
+#endif
+
+   return vsrc.m128i;
+}
+
+static inline __m128i
+vec_load_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+   return vsrc.m128i;
+}
+
+static inline void
+vec_store_si128 (uint32_t* dest, __m128i vdata)
+{
+   vec_st ((vector unsigned int) vdata, 0, dest);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline int
+vec_movemask_epi8 (__m128i vsrc)
+{
+   __m128i_union vtemp;
+   int result;
+
+   vtemp.m128i = vec_vgbbd(vsrc);
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   result = vtemp.ub[15] << 8 | vtemp.ub[7];
+#else
+   result = vtemp.ub[0] << 8 | vtemp.ub[8];
+#endif
+
+   return result;
+}
+
+static inline __m128i
+vec_packs_epi16 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed short) a,
+                               (vector signed short) b);
+#else
+   return (__m128i) vec_packs ((vector signed short) b,
+                               (vector signed short) a);
+#endif
+}
+
+static inline __m128i
+vec_packs_epi32 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
+#else
+   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
+#endif
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+#endif /* U_PWR8_H_ */
index cda0f2e..fe1917f 100644 (file)
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
 
 
 struct pipe_context;
index 7f8e5a1..cae4138 100644 (file)
@@ -166,14 +166,49 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
+/*
+ * Provide an SSE implementation of _mm_mul_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Basically, albeit surprising at first (and second, and third...) look
+ * if a * b is done signed instead of unsigned, can just
+ * subtract b from the high bits of the result if a is negative
+ * (and the same for a if b is negative). Modular arithmetic at its best!
+ *
+ * So for int32 a,b in crude pseudo-code ("*" here denoting a widening mul)
+ * fixupb = (signmask(b) & a) << 32ULL
+ * fixupa = (signmask(a) & b) << 32ULL
+ * a * b = (unsigned)a * (unsigned)b - fixupb - fixupa
+ * = (unsigned)a * (unsigned)b -(fixupb + fixupa)
+ *
+ * This does both lo (dwords 0/2) and hi parts (1/3) at the same time due
+ * to some optimization potential.
+ */
+static inline __m128i
+mm_mullohi_epi32(const __m128i a, const __m128i b, __m128i *res13)
+{
+   __m128i a13, b13, mul02, mul13;
+   __m128i anegmask, bnegmask, fixup, fixup02, fixup13;
+   a13 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2,3,0,1));
+   b13 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2,3,0,1));
+   anegmask = _mm_srai_epi32(a, 31);
+   bnegmask = _mm_srai_epi32(b, 31);
+   fixup = _mm_add_epi32(_mm_and_si128(anegmask, b),
+                         _mm_and_si128(bnegmask, a));
+   mul02 = _mm_mul_epu32(a, b);
+   mul13 = _mm_mul_epu32(a13, b13);
+   fixup02 = _mm_slli_epi64(fixup, 32);
+   fixup13 = _mm_and_si128(fixup, _mm_set_epi32(-1,0,-1,0));
+   *res13 = _mm_sub_epi64(mul13, fixup13);
+   return _mm_sub_epi64(mul02, fixup02);
+}
 
 
 /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
  * _mm_mul_epu32().
  *
- * I suspect this works fine for us because one of our operands is
- * always positive, but not sure that this can be used for general
- * signed integer multiplication.
+ * This always works regardless the signs of the operands, since
+ * the high bits (which would be different) aren't used.
  *
  * This seems close enough to the speed of SSE4 and the real
  * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
@@ -188,6 +223,12 @@ static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 
    /* Interleave the results, either with shuffles or (slightly
     * faster) direct bit operations:
+    * XXX: might be only true for some cpus (in particular 65nm
+    * Core 2). On most cpus (including that Core 2, but not Nehalem...)
+    * using _mm_shuffle_ps/_mm_shuffle_epi32 might also be faster
+    * than using the 3 instructions below. But logic should be fine
+    * as well, we can't have optimal solution for all cpus (if anything,
+    * should just use _mm_mullo_epi32() if sse41 is available...).
     */
 #if 0
    __m128i ba8             = _mm_shuffle_epi32(ba, 8);
@@ -214,17 +255,44 @@ transpose4_epi32(const __m128i * restrict a,
                  __m128i * restrict q,
                  __m128i * restrict r)
 {
-  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
-  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
-  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
-  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
-
-  *o = _mm_unpacklo_epi64(t0, t1);
-  *p = _mm_unpackhi_epi64(t0, t1);
-  *q = _mm_unpacklo_epi64(t2, t3);
-  *r = _mm_unpackhi_epi64(t2, t3);
+   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
 }
 
+
+/*
+ * Same as above, except the first two values are already interleaved
+ * (i.e. contain 64bit values).
+ */
+static inline void
+transpose2_64_2_32(const __m128i * restrict a01,
+                   const __m128i * restrict a23,
+                   const __m128i * restrict c,
+                   const __m128i * restrict d,
+                   __m128i * restrict o,
+                   __m128i * restrict p,
+                   __m128i * restrict q,
+                   __m128i * restrict r)
+{
+   __m128i t0 = *a01;
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = *a23;
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
 #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
 
 
index 6aa44f9..c150d92 100644 (file)
@@ -600,7 +600,8 @@ is_box_inside_resource(const struct pipe_resource *res,
       depth = res->array_size;
       assert(res->array_size % 6 == 0);
       break;
-   case PIPE_MAX_TEXTURE_TYPES:;
+   case PIPE_MAX_TEXTURE_TYPES:
+      break;
    }
 
    return box->x >= 0 &&
index b672fad..aa31ef2 100644 (file)
@@ -42,8 +42,8 @@ struct u_upload_mgr {
    struct pipe_context *pipe;
 
    unsigned default_size;  /* Minimum size of the upload buffer, in bytes. */
-   unsigned alignment;     /* Alignment of each sub-allocation. */
    unsigned bind;          /* Bitmask of PIPE_BIND_* flags. */
+   unsigned usage;         /* PIPE_USAGE_* */
    unsigned map_flags;     /* Bitmask of PIPE_TRANSFER_* flags. */
    boolean map_persistent; /* If persistent mappings are supported. */
 
@@ -55,10 +55,9 @@ struct u_upload_mgr {
 };
 
 
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned alignment,
-                                      unsigned bind )
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage)
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
    if (!upload)
@@ -66,8 +65,8 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
 
    upload->pipe = pipe;
    upload->default_size = default_size;
-   upload->alignment = alignment;
    upload->bind = bind;
+   upload->usage = usage;
 
    upload->map_persistent =
       pipe->screen->get_param(pipe->screen,
@@ -149,7 +148,7 @@ u_upload_alloc_buffer(struct u_upload_mgr *upload,
    buffer.target = PIPE_BUFFER;
    buffer.format = PIPE_FORMAT_R8_UNORM; /* want TYPELESS or similar */
    buffer.bind = upload->bind;
-   buffer.usage = PIPE_USAGE_STREAM;
+   buffer.usage = upload->usage;
    buffer.width0 = size;
    buffer.height0 = 1;
    buffer.depth0 = 1;
@@ -181,19 +180,24 @@ void
 u_upload_alloc(struct u_upload_mgr *upload,
                unsigned min_out_offset,
                unsigned size,
+               unsigned alignment,
                unsigned *out_offset,
                struct pipe_resource **outbuf,
                void **ptr)
 {
-   unsigned alloc_size = align(size, upload->alignment);
-   unsigned alloc_offset = align(min_out_offset, upload->alignment);
    unsigned buffer_size = upload->buffer ? upload->buffer->width0 : 0;
    unsigned offset;
 
+   min_out_offset = align(min_out_offset, alignment);
+
+   offset = align(upload->offset, alignment);
+   offset = MAX2(offset, min_out_offset);
+
    /* Make sure we have enough space in the upload buffer
-    * for the sub-allocation. */
-   if (unlikely(MAX2(upload->offset, alloc_offset) + alloc_size > buffer_size)) {
-      u_upload_alloc_buffer(upload, alloc_offset + alloc_size);
+    * for the sub-allocation.
+    */
+   if (unlikely(!upload->buffer || offset + size > buffer_size)) {
+      u_upload_alloc_buffer(upload, min_out_offset + size);
 
       if (unlikely(!upload->buffer)) {
          *out_offset = ~0;
@@ -202,11 +206,10 @@ u_upload_alloc(struct u_upload_mgr *upload,
          return;
       }
 
+      offset = min_out_offset;
       buffer_size = upload->buffer->width0;
    }
 
-   offset = MAX2(upload->offset, alloc_offset);
-
    if (unlikely(!upload->map)) {
       upload->map = pipe_buffer_map_range(upload->pipe, upload->buffer,
                                           offset,
@@ -224,8 +227,8 @@ u_upload_alloc(struct u_upload_mgr *upload,
       upload->map -= offset;
    }
 
-   assert(offset < upload->buffer->width0);
-   assert(offset + size <= upload->buffer->width0);
+   assert(offset < buffer_size);
+   assert(offset + size <= buffer_size);
    assert(size);
 
    /* Emit the return values: */
@@ -233,19 +236,20 @@ u_upload_alloc(struct u_upload_mgr *upload,
    pipe_resource_reference(outbuf, upload->buffer);
    *out_offset = offset;
 
-   upload->offset = offset + alloc_size;
+   upload->offset = offset + size;
 }
 
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf)
 {
    uint8_t *ptr;
 
-   u_upload_alloc(upload, min_out_offset, size,
+   u_upload_alloc(upload, min_out_offset, size, alignment,
                   out_offset, outbuf,
                   (void**)&ptr);
    if (ptr)
@@ -257,6 +261,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf)
@@ -278,6 +283,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
    if (0)
       debug_printf("upload ptr %p ofs %d sz %d\n", map, offset, size);
 
-   u_upload_data(upload, min_out_offset, size, map, out_offset, outbuf);
+   u_upload_data(upload, min_out_offset, size, alignment,
+                 map, out_offset, outbuf);
    pipe_buffer_unmap( upload->pipe, transfer );
 }
index 67c6daa..1d933d7 100644 (file)
@@ -43,13 +43,12 @@ struct pipe_resource;
  *
  * \param pipe          Pipe driver.
  * \param default_size  Minimum size of the upload buffer, in bytes.
- * \param alignment     Alignment of each suballocation in the upload buffer.
  * \param bind          Bitmask of PIPE_BIND_* flags.
+ * \param usage         PIPE_USAGE_*
  */
-struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
-                                      unsigned default_size,
-                                      unsigned alignment,
-                                      unsigned bind );
+struct u_upload_mgr *
+u_upload_create(struct pipe_context *pipe, unsigned default_size,
+                unsigned bind, unsigned usage);
 
 /**
  * Destroy the upload manager.
@@ -74,6 +73,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
  * \param upload           Upload manager
  * \param min_out_offset   Minimum offset that should be returned in out_offset.
  * \param size             Size of the allocation.
+ * \param alignment        Alignment of the suballocation within the buffer
  * \param out_offset       Pointer to where the new buffer offset will be returned.
  * \param outbuf           Pointer to where the upload buffer will be returned.
  * \param ptr              Pointer to the allocated memory that is returned.
@@ -81,6 +81,7 @@ void u_upload_unmap( struct u_upload_mgr *upload );
 void u_upload_alloc(struct u_upload_mgr *upload,
                     unsigned min_out_offset,
                     unsigned size,
+                    unsigned alignment,
                     unsigned *out_offset,
                     struct pipe_resource **outbuf,
                     void **ptr);
@@ -95,6 +96,7 @@ void u_upload_alloc(struct u_upload_mgr *upload,
 void u_upload_data(struct u_upload_mgr *upload,
                    unsigned min_out_offset,
                    unsigned size,
+                   unsigned alignment,
                    const void *data,
                    unsigned *out_offset,
                    struct pipe_resource **outbuf);
@@ -110,6 +112,7 @@ void u_upload_buffer(struct u_upload_mgr *upload,
                      unsigned min_out_offset,
                      unsigned offset,
                      unsigned size,
+                     unsigned alignment,
                      struct pipe_resource *inbuf,
                      unsigned *out_offset,
                      struct pipe_resource **outbuf);
index 54e9e71..e16ee36 100644 (file)
@@ -314,8 +314,9 @@ u_vbuf_create(struct pipe_context *pipe,
    mgr->translate_cache = translate_cache_create();
    memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));
 
-   mgr->uploader = u_upload_create(pipe, 1024 * 1024, 4,
-                                   PIPE_BIND_VERTEX_BUFFER);
+   mgr->uploader = u_upload_create(pipe, 1024 * 1024,
+                                   PIPE_BIND_VERTEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
 
    return mgr;
 }
@@ -454,7 +455,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader, 0,
-                     key->output_stride * num_indices,
+                     key->output_stride * num_indices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
@@ -487,7 +488,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
       /* Create and map the output buffer. */
       u_upload_alloc(mgr->uploader,
                      key->output_stride * start_vertex,
-                     key->output_stride * num_vertices,
+                     key->output_stride * num_vertices, 4,
                      &out_offset, &out_buffer,
                      (void**)&out_map);
       if (!out_buffer)
@@ -987,7 +988,7 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
       real_vb = &mgr->real_vertex_buffer[i];
       ptr = mgr->vertex_buffer[i].user_buffer;
 
-      u_upload_data(mgr->uploader, start, end - start, ptr + start,
+      u_upload_data(mgr->uploader, start, end - start, 4, ptr + start,
                     &real_vb->buffer_offset, &real_vb->buffer);
       if (!real_vb->buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
index afe5306..77688f0 100644 (file)
@@ -716,6 +716,7 @@ gen_vertex_data(struct vl_compositor *c, struct vl_compositor_state *s, struct u
    /* Allocate new memory for vertices. */
    u_upload_alloc(c->upload, 0,
                   c->vertex_buf.stride * VL_COMPOSITOR_MAX_LAYERS * 4, /* size */
+                  4, /* alignment */
                   &c->vertex_buf.buffer_offset, &c->vertex_buf.buffer,
                   (void**)&vb);
 
@@ -1090,7 +1091,8 @@ vl_compositor_init(struct vl_compositor *c, struct pipe_context *pipe)
 
    c->pipe = pipe;
 
-   c->upload = u_upload_create(pipe, 128 * 1024, 4, PIPE_BIND_VERTEX_BUFFER);
+   c->upload = u_upload_create(pipe, 128 * 1024, PIPE_BIND_VERTEX_BUFFER,
+                               PIPE_USAGE_STREAM);
 
    if (!c->upload)
       return false;
index 8fa70e8..9e782e5 100644 (file)
@@ -47,6 +47,7 @@
 #include "util/u_draw.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_format.h"
 
 #include "vl_types.h"
 #include "vl_video_buffer.h"
@@ -214,8 +215,9 @@ create_deint_frag_shader(struct vl_deint_filter *filter, unsigned field,
             ureg_imm4f(shader, -0.02353f, 0, 0, 0));
    ureg_MUL(shader, ureg_saturate(ureg_writemask(t_diff, TGSI_WRITEMASK_X)),
             ureg_src(t_diff), ureg_imm4f(shader, 31.8750f, 0, 0, 0));
-   ureg_LRP(shader, ureg_writemask(o_fragment, TGSI_WRITEMASK_X), ureg_src(t_diff),
+   ureg_LRP(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_X), ureg_src(t_diff),
             ureg_src(t_linear), ureg_src(t_weave));
+   ureg_MOV(shader, o_fragment, ureg_scalar(ureg_src(t_tex), TGSI_SWIZZLE_X));
 
    ureg_release_temporary(shader, t_tex);
    ureg_release_temporary(shader, t_comp_top);
@@ -253,7 +255,13 @@ vl_deint_filter_init(struct vl_deint_filter *filter, struct pipe_context *pipe,
 
    /* TODO: handle other than 4:2:0 subsampling */
    memset(&templ, 0, sizeof(templ));
-   templ.buffer_format = PIPE_FORMAT_YV12;
+   templ.buffer_format = pipe->screen->get_video_param
+   (
+      pipe->screen,
+      PIPE_VIDEO_PROFILE_UNKNOWN,
+      PIPE_VIDEO_ENTRYPOINT_UNKNOWN,
+      PIPE_VIDEO_CAP_PREFERED_FORMAT
+   );
    templ.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
    templ.width = video_width;
    templ.height = video_height;
@@ -271,10 +279,20 @@ vl_deint_filter_init(struct vl_deint_filter *filter, struct pipe_context *pipe,
       goto error_rs_state;
 
    memset(&blend, 0, sizeof blend);
-   blend.rt[0].colormask = PIPE_MASK_RGBA;
-   filter->blend = pipe->create_blend_state(pipe, &blend);
-   if (!filter->blend)
-      goto error_blend;
+   blend.rt[0].colormask = PIPE_MASK_R;
+   filter->blend[0] = pipe->create_blend_state(pipe, &blend);
+   if (!filter->blend[0])
+      goto error_blendR;
+
+   blend.rt[0].colormask = PIPE_MASK_G;
+   filter->blend[1] = pipe->create_blend_state(pipe, &blend);
+   if (!filter->blend[1])
+      goto error_blendG;
+
+   blend.rt[0].colormask = PIPE_MASK_B;
+   filter->blend[2] = pipe->create_blend_state(pipe, &blend);
+   if (!filter->blend[2])
+      goto error_blendB;
 
    memset(&sampler, 0, sizeof(sampler));
    sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
@@ -349,9 +367,15 @@ error_quad:
    pipe->delete_sampler_state(pipe, filter->sampler);
 
 error_sampler:
-   pipe->delete_blend_state(pipe, filter->blend);
+   pipe->delete_blend_state(pipe, filter->blend[2]);
+
+error_blendB:
+   pipe->delete_blend_state(pipe, filter->blend[1]);
+
+error_blendG:
+   pipe->delete_blend_state(pipe, filter->blend[0]);
 
-error_blend:
+error_blendR:
    pipe->delete_rasterizer_state(pipe, filter->rs_state);
 
 error_rs_state:
@@ -367,7 +391,9 @@ vl_deint_filter_cleanup(struct vl_deint_filter *filter)
    assert(filter);
 
    filter->pipe->delete_sampler_state(filter->pipe, filter->sampler[0]);
-   filter->pipe->delete_blend_state(filter->pipe, filter->blend);
+   filter->pipe->delete_blend_state(filter->pipe, filter->blend[0]);
+   filter->pipe->delete_blend_state(filter->pipe, filter->blend[1]);
+   filter->pipe->delete_blend_state(filter->pipe, filter->blend[2]);
    filter->pipe->delete_rasterizer_state(filter->pipe, filter->rs_state);
    filter->pipe->delete_vertex_elements_state(filter->pipe, filter->ves);
    pipe_resource_reference(&filter->quad.buffer, NULL);
@@ -420,12 +446,14 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
    struct pipe_sampler_view **next_sv;
    struct pipe_sampler_view *sampler_views[4];
    struct pipe_surface **dst_surfaces;
-   int j;
+   const unsigned *plane_order;
+   int i, j;
 
    assert(filter && prevprev && prev && cur && next && field <= 1);
 
    /* set up destination and source */
    dst_surfaces = filter->video_buffer->get_surfaces(filter->video_buffer);
+   plane_order = vl_video_buffer_plane_order(filter->video_buffer->buffer_format);
    cur_sv = cur->get_sampler_view_components(cur);
    prevprev_sv = prevprev->get_sampler_view_components(prevprev);
    prev_sv = prev->get_sampler_view_components(prev);
@@ -433,7 +461,6 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
 
    /* set up pipe state */
    filter->pipe->bind_rasterizer_state(filter->pipe, filter->rs_state);
-   filter->pipe->bind_blend_state(filter->pipe, filter->blend);
    filter->pipe->set_vertex_buffers(filter->pipe, 0, 1, &filter->quad);
    filter->pipe->bind_vertex_elements_state(filter->pipe, filter->ves);
    filter->pipe->bind_vs_state(filter->pipe, filter->vs);
@@ -449,12 +476,13 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
    fb_state.nr_cbufs = 1;
 
    /* process each plane separately */
-   for (j = 0; j < 3; j++) {
-      /* select correct YV12 surfaces */
-      int k = j == 1 ? 2 :
-              j == 2 ? 1 : 0;
-      struct pipe_surface *blit_surf = dst_surfaces[2 * k + field];
-      struct pipe_surface *dst_surf = dst_surfaces[2 * k + 1 - field];
+   for (i = 0, j = 0; i < VL_NUM_COMPONENTS; ++i) {
+      struct pipe_surface *blit_surf = dst_surfaces[field];
+      struct pipe_surface *dst_surf = dst_surfaces[1 - field];
+      int k = plane_order[i];
+
+      /* bind blend state for this component in the plane */
+      filter->pipe->bind_blend_state(filter->pipe, filter->blend[j]);
 
       /* update render target state */
       viewport.scale[0] = blit_surf->texture->width0;
@@ -463,10 +491,10 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
       fb_state.height = blit_surf->texture->height0;
 
       /* update sampler view sources  */
-      sampler_views[0] = prevprev_sv[j];
-      sampler_views[1] = prev_sv[j];
-      sampler_views[2] = cur_sv[j];
-      sampler_views[3] = next_sv[j];
+      sampler_views[0] = prevprev_sv[k];
+      sampler_views[1] = prev_sv[k];
+      sampler_views[2] = cur_sv[k];
+      sampler_views[3] = next_sv[k];
       filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, 0, 4, sampler_views);
 
       /* blit current field */
@@ -479,11 +507,16 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
       /* blit or interpolate other field */
       fb_state.cbufs[0] = dst_surf;
       filter->pipe->set_framebuffer_state(filter->pipe, &fb_state);
-      if (j > 0 && filter->skip_chroma) {
+      if (i > 0 && filter->skip_chroma) {
          util_draw_arrays(filter->pipe, PIPE_PRIM_QUADS, 0, 4);
       } else {
          filter->pipe->bind_fs_state(filter->pipe, field ? filter->fs_deint_top : filter->fs_deint_bottom);
          util_draw_arrays(filter->pipe, PIPE_PRIM_QUADS, 0, 4);
       }
+
+      if (++j >= util_format_get_nr_components(dst_surf->format)) {
+         dst_surfaces += 2;
+         j = 0;
+      }
    }
 }
index 3ca378b..49cc96c 100644 (file)
@@ -38,7 +38,7 @@ struct vl_deint_filter
    struct pipe_vertex_buffer quad;
 
    void *rs_state;
-   void *blend;
+   void *blend[3];
    void *sampler[4];
    void *ves;
    void *vs;
index 6c317bb..eb703a9 100644 (file)
@@ -79,14 +79,18 @@ calc_position(struct vl_mc *r, struct ureg_program *shader, struct ureg_src bloc
 }
 
 static struct ureg_dst
-calc_line(struct ureg_program *shader)
+calc_line(struct pipe_screen *screen, struct ureg_program *shader)
 {
    struct ureg_dst tmp;
    struct ureg_src pos;
 
    tmp = ureg_DECL_temporary(shader);
 
-   pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS, TGSI_INTERPOLATE_LINEAR);
+   if (screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL))
+      pos = ureg_DECL_system_value(shader, TGSI_SEMANTIC_POSITION, 0);
+   else
+      pos = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS,
+                               TGSI_INTERPOLATE_LINEAR);
 
    /*
     * tmp.y = fraction(pos.y / 2) >= 0.5 ? 1 : 0
@@ -177,7 +181,7 @@ create_ref_frag_shader(struct vl_mc *r)
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   field = calc_line(shader);
+   field = calc_line(r->pipe->screen, shader);
 
    /*
     * ref = field.z ? tc[1] : tc[0]
@@ -324,7 +328,7 @@ create_ycbcr_frag_shader(struct vl_mc *r, float scale, bool invert,
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
-   tmp = calc_line(shader);
+   tmp = calc_line(r->pipe->screen, shader);
 
    /*
     * if (field == tc.w)
index f5bb3a0..b5c7045 100644 (file)
@@ -792,7 +792,7 @@ vl_mpeg12_end_frame(struct pipe_video_codec *decoder,
       for (j = 0; j < VL_MAX_REF_FRAMES; ++j) {
          if (!ref_frames[j] || !ref_frames[j][i]) continue;
 
-         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);;
+         vb[2] = vl_vb_get_mv(&buf->vertex_stream, j);
          dec->context->set_vertex_buffers(dec->context, 0, 3, vb);
 
          vl_mc_render_ref(i ? &dec->mc_c : &dec->mc_y, &buf->mc[i], ref_frames[j][i]);
index e8cd24d..462fdcb 100644 (file)
@@ -253,14 +253,8 @@ vl_video_buffer_template(struct pipe_resource *templ,
    templ->bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
    templ->usage = usage;
 
-   if (plane > 0) {
-      if (tmpl->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
-         templ->width0 /= 2;
-         templ->height0 /= 2;
-      } else if (tmpl->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) {
-         templ->width0 /= 2;
-      }
-   }
+   vl_video_buffer_adjust_size(&templ->width0, &templ->height0, plane,
+                               tmpl->chroma_format, false);
 }
 
 static void
index 488c3cc..8a1c077 100644 (file)
@@ -48,6 +48,24 @@ struct vl_video_buffer
    struct pipe_surface      *surfaces[VL_MAX_SURFACES];
 };
 
+static inline void
+vl_video_buffer_adjust_size(unsigned *width, unsigned *height, unsigned plane,
+                            enum pipe_video_chroma_format chroma_format,
+                            bool interlaced)
+{
+   if (interlaced) {
+      *height /= 2;
+   }
+   if (plane > 0) {
+      if (chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
+         *width /= 2;
+         *height /= 2;
+      } else if (chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) {
+         *width /= 2;
+      }
+   }
+}
+
 /**
  * get subformats for each plane
  */
index 9a32716..4c03e00 100644 (file)
@@ -648,3 +648,14 @@ In addition, normal texture sampling is allowed from the compute
 program: ``bind_sampler_states`` may be used to set up texture
 samplers for the compute stage and ``set_sampler_views`` may
 be used to bind a number of sampler views to it.
+
+Mipmap generation
+^^^^^^^^^^^^^^^^^
+
+If PIPE_CAP_GENERATE_MIPMAP is true, ``generate_mipmap`` can be used
+to generate mipmaps for the specified texture resource.
+It replaces texel image levels base_level+1 through
+last_level for the layers from first_layer through last_layer.
+It returns TRUE if mipmap generation succeeds, otherwise it
+returns FALSE. Mipmap generation may fail when it is not supported
+for particular texture types or formats.
index e900283..d7ea123 100644 (file)
@@ -213,6 +213,11 @@ The integer capabilities:
 * ``PIPE_CAP_DRAW_INDIRECT``: Whether the driver supports taking draw arguments
   { count, instance_count, start, index_bias } from a PIPE_BUFFER resource.
   See pipe_draw_info.
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT``: Whether the driver supports
+  pipe_draw_info::indirect_stride and ::indirect_count
+* ``PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS``: Whether the driver supports
+  taking the number of indirect draws from a separate parameter
+  buffer, see pipe_draw_info::indirect_params.
 * ``PIPE_CAP_TGSI_FS_FINE_DERIVATIVE``: Whether the fragment shader supports
   the FINE versions of DDX/DDY.
 * ``PIPE_CAP_VENDOR_ID``: The vendor ID of the underlying hardware. If it's
@@ -239,8 +244,7 @@ The integer capabilities:
   will need to lower TGSI_SEMANTIC_VERTEXID to TGSI_SEMANTIC_VERTEXID_NOBASE
   and TGSI_SEMANTIC_BASEVERTEX, so drivers setting this must handle both these
   semantics. Only relevant if geometry shaders are supported.
-  (Currently not possible to query availability of these two semantics outside
-  this, at least BASEVERTEX should be exposed separately too).
+  (BASEVERTEX could be exposed separately too via ``PIPE_CAP_DRAW_PARAMETERS``).
 * ``PIPE_CAP_POLYGON_OFFSET_CLAMP``: If true, the driver implements support
   for ``pipe_rasterizer_state::offset_clamp``.
 * ``PIPE_CAP_MULTISAMPLE_Z_RESOLVE``: Whether the driver supports blitting
@@ -283,6 +287,24 @@ The integer capabilities:
   a compressed block is copied to/from a plain pixel of the same size.
 * ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
   available in contexts.
+* ``PIPE_CAP_DRAW_PARAMETERS``: Whether ``TGSI_SEMANTIC_BASEVERTEX``,
+  ``TGSI_SEMANTIC_BASEINSTANCE``, and ``TGSI_SEMANTIC_DRAWID`` are
+  supported in vertex shaders.
+* ``PIPE_CAP_TGSI_PACK_HALF_FLOAT``: Whether the ``UP2H`` and ``PK2H``
+  TGSI opcodes are supported.
+* ``PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL``: If state trackers should use
+  a system value for the POSITION fragment shader input.
+* ``PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL``: If state trackers should use
+  a system value for the FACE fragment shader input.
+  Also, the FACE system value is integer, not float.
+* ``PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT``: Describes the required
+  alignment for pipe_shader_buffer::buffer_offset, in bytes. Maximum
+  value allowed is 256 (for GL conformance). 0 is only allowed if
+  shader buffers are not supported.
+* ``PIPE_CAP_INVALIDATE_BUFFER``: Whether the use of ``invalidate_resource``
+  for buffers is supported.
+* ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
+  is supported.
 
 
 .. _pipe_capf:
@@ -375,6 +397,10 @@ to be 0.
   of iterations that loops are allowed to have to be unrolled. It is only
   a hint to state trackers. Whether any loops will be unrolled is not
   guaranteed.
+* ``PIPE_SHADER_CAP_MAX_SHADER_BUFFERS``: Maximum number of memory buffers
+  (also used to implement atomic counters). Having this be non-0 also
+  implies support for the ``LOAD``, ``STORE``, and ``ATOM*`` TGSI
+  opcodes.
 
 
 .. _pipe_compute_cap:
index e7b0c2f..7810a3e 100644 (file)
@@ -458,7 +458,11 @@ while DDY is allowed to be the same for the entire 2x2 quad.
 
 .. opcode:: PK2H - Pack Two 16-bit Floats
 
-  TBD
+This instruction replicates its result.
+
+.. math::
+
+  dst = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16
 
 
 .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
@@ -615,7 +619,15 @@ This instruction replicates its result.
 
 .. opcode:: UP2H - Unpack Two 16-Bit Floats
 
-  TBD
+.. math::
+
+  dst.x = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.y = f16\_to\_f32(src0.x >> 16)
+
+  dst.z = f16\_to\_f32(src0.x \& 0xffff)
+
+  dst.w = f16\_to\_f32(src0.x >> 16)
 
 .. note::
 
@@ -2252,11 +2264,11 @@ after lookup.
 Resource Access Opcodes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-.. opcode:: LOAD - Fetch data from a shader resource
+.. opcode:: LOAD - Fetch data from a shader buffer or image
 
                Syntax: ``LOAD dst, resource, address``
 
-               Example: ``LOAD TEMP[0], RES[0], TEMP[1]``
+               Example: ``LOAD TEMP[0], BUFFER[0], TEMP[1]``
 
                Using the provided integer address, LOAD fetches data
                from the specified buffer or texture without any
@@ -2280,7 +2292,7 @@ Resource Access Opcodes
 
                Syntax: ``STORE resource, address, src``
 
-               Example: ``STORE RES[0], TEMP[0], TEMP[1]``
+               Example: ``STORE BUFFER[0], TEMP[0], TEMP[1]``
 
                Using the provided integer address, STORE writes data
                to the specified buffer or texture.
@@ -2299,6 +2311,18 @@ Resource Access Opcodes
                texture arrays and 2D textures.  address.w is always
                ignored.
 
+.. opcode:: RESQ - Query information about a resource
+
+  Syntax: ``RESQ dst, resource``
+
+  Example: ``RESQ TEMP[0], BUFFER[0]``
+
+  Returns information about the buffer or image resource. For buffer
+  resources, the size (in bytes) is returned in the x component. For
+  image resources, .xyz will contain the width/height/layers of the
+  image, while .w will contain the number of samples for multi-sampled
+  images.
+
 
 .. _threadsyncopcodes:
 
@@ -2358,158 +2382,159 @@ These opcodes provide atomic variants of some common arithmetic and
 logical operations.  In this context atomicity means that another
 concurrent memory access operation that affects the same memory
 location is guaranteed to be performed strictly before or after the
-entire execution of the atomic operation.
-
-For the moment they're only valid in compute programs.
+entire execution of the atomic operation. The resource may be a buffer
+or an image. In the case of an image, the offset works the same as for
+``LOAD`` and ``STORE``, specified above. These atomic operations may
+only be used with 32-bit integer image formats.
 
 .. opcode:: ATOMUADD - Atomic integer addition
 
   Syntax: ``ATOMUADD dst, resource, offset, src``
 
-  Example: ``ATOMUADD TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUADD TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i + src_i
+  resource[offset] = dst_x + src_x
 
 
 .. opcode:: ATOMXCHG - Atomic exchange
 
   Syntax: ``ATOMXCHG dst, resource, offset, src``
 
-  Example: ``ATOMXCHG TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXCHG TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = src_i
+  resource[offset] = src_x
 
 
 .. opcode:: ATOMCAS - Atomic compare-and-exchange
 
   Syntax: ``ATOMCAS dst, resource, offset, cmp, src``
 
-  Example: ``ATOMCAS TEMP[0], RES[0], TEMP[1], TEMP[2], TEMP[3]``
+  Example: ``ATOMCAS TEMP[0], BUFFER[0], TEMP[1], TEMP[2], TEMP[3]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i == cmp_i ? src_i : dst_i)
+  resource[offset] = (dst_x == cmp_x ? src_x : dst_x)
 
 
 .. opcode:: ATOMAND - Atomic bitwise And
 
   Syntax: ``ATOMAND dst, resource, offset, src``
 
-  Example: ``ATOMAND TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMAND TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \& src_i
+  resource[offset] = dst_x \& src_x
 
 
 .. opcode:: ATOMOR - Atomic bitwise Or
 
   Syntax: ``ATOMOR dst, resource, offset, src``
 
-  Example: ``ATOMOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i | src_i
+  resource[offset] = dst_x | src_x
 
 
 .. opcode:: ATOMXOR - Atomic bitwise Xor
 
   Syntax: ``ATOMXOR dst, resource, offset, src``
 
-  Example: ``ATOMXOR TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMXOR TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = dst_i \oplus src_i
+  resource[offset] = dst_x \oplus src_x
 
 
 .. opcode:: ATOMUMIN - Atomic unsigned minimum
 
   Syntax: ``ATOMUMIN dst, resource, offset, src``
 
-  Example: ``ATOMUMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMUMAX - Atomic unsigned maximum
 
   Syntax: ``ATOMUMAX dst, resource, offset, src``
 
-  Example: ``ATOMUMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMUMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMIN - Atomic signed minimum
 
   Syntax: ``ATOMIMIN dst, resource, offset, src``
 
-  Example: ``ATOMIMIN TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMIN TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i < src_i ? dst_i : src_i)
+  resource[offset] = (dst_x < src_x ? dst_x : src_x)
 
 
 .. opcode:: ATOMIMAX - Atomic signed maximum
 
   Syntax: ``ATOMIMAX dst, resource, offset, src``
 
-  Example: ``ATOMIMAX TEMP[0], RES[0], TEMP[1], TEMP[2]``
+  Example: ``ATOMIMAX TEMP[0], BUFFER[0], TEMP[1], TEMP[2]``
 
-  The following operation is performed atomically on each component:
+  The following operation is performed atomically:
 
 .. math::
 
-  dst_i = resource[offset]_i
+  dst_x = resource[offset]
 
-  resource[offset]_i = (dst_i > src_i ? dst_i : src_i)
+  resource[offset] = (dst_x > src_x ? dst_x : src_x)
 
 
 
@@ -2646,7 +2671,8 @@ space coordinate system.  After clipping, the X, Y and Z components of the
 vertex will be divided by the W value to get normalized device coordinates.
 
 For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
-fragment shader input contains the fragment's window position.  The X
+fragment shader input (or system value, depending on which one is
+supported by the driver) contains the fragment's window position.  The X
 component starts at zero and always increases from left to right.
 The Y component starts at zero and always increases but Y=0 may either
 indicate the top of the window or the bottom depending on the fragment
@@ -2758,11 +2784,17 @@ typically only used for legacy graphics APIs.
 TGSI_SEMANTIC_FACE
 """"""""""""""""""
 
-This label applies to fragment shader inputs only and indicates that
-the register contains front/back-face information of the form (F, 0,
-0, 1).  The first component will be positive when the fragment belongs
-to a front-facing polygon, and negative when the fragment belongs to a
-back-facing polygon.
+This label applies to fragment shader inputs (or system values,
+depending on which one is supported by the driver) and indicates that
+the register contains front/back-face information.
+
+If it is an input, it will be a floating-point vector in the form (F, 0, 0, 1),
+where F will be positive when the fragment belongs to a front-facing polygon,
+and negative when the fragment belongs to a back-facing polygon.
+
+If it is a system value, it will be an integer vector in the form (F, 0, 0, 1),
+where F is 0xffffffff when the fragment belongs to a front-facing polygon and
+0 when the fragment belongs to a back-facing polygon.
 
 
 TGSI_SEMANTIC_EDGEFLAG
@@ -2949,6 +2981,19 @@ invocation is covered or not. Helper invocations are created in order
 to properly compute derivatives, however it may be desirable to skip
 some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
 
+TGSI_SEMANTIC_BASEINSTANCE
+""""""""""""""""""""""""""
+
+For vertex shaders, the base instance argument supplied for this
+draw. This is an integer value, and only the X component is used.
+
+TGSI_SEMANTIC_DRAWID
+""""""""""""""""""""
+
+For vertex shaders, the zero-based index of the current draw in a
+``glMultiDraw*`` invocation. This is an integer value, and only the X
+component is used.
+
 
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/drivers/freedreno/.gitignore b/src/gallium/drivers/freedreno/.gitignore
new file mode 100644 (file)
index 0000000..150f5d1
--- /dev/null
@@ -0,0 +1 @@
+ir3_compiler
index baae914..74ef416 100644 (file)
@@ -128,6 +128,7 @@ ir3_SOURCES := \
        ir3/ir3_group.c \
        ir3/ir3.h \
        ir3/ir3_legalize.c \
+       ir3/ir3_nir.c \
        ir3/ir3_nir.h \
        ir3/ir3_nir_lower_if_else.c \
        ir3/ir3_print.c \
index 77f708f..d231113 100644 (file)
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
index a6940df..c4f253b 100644 (file)
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -1421,15 +1421,23 @@ static inline uint32_t A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(enum adreno_pa_
 #define REG_A3XX_PC_RESTART_INDEX                              0x000021ed
 
 #define REG_A3XX_HLSQ_CONTROL_0_REG                            0x00002200
-#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK             0x00000010
+#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK             0x00000030
 #define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT            4
 static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize val)
 {
        return ((val) << A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK;
 }
 #define A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE            0x00000040
+#define A3XX_HLSQ_CONTROL_0_REG_COMPUTEMODE                    0x00000100
 #define A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART                        0x00000200
 #define A3XX_HLSQ_CONTROL_0_REG_RESERVED2                      0x00000400
+#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK     0x00fff000
+#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT    12
+static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC(uint32_t val)
+{
+       return ((val) << A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK;
+}
+#define A3XX_HLSQ_CONTROL_0_REG_FSONLYTEX                      0x02000000
 #define A3XX_HLSQ_CONTROL_0_REG_CHUNKDISABLE                   0x04000000
 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__MASK                        0x08000000
 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__SHIFT               27
@@ -1443,17 +1451,39 @@ static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(uint32_t val)
 #define A3XX_HLSQ_CONTROL_0_REG_SINGLECONTEXT                  0x80000000
 
 #define REG_A3XX_HLSQ_CONTROL_1_REG                            0x00002201
-#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK             0x00000040
+#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK             0x000000c0
 #define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT            6
 static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(enum a3xx_threadsize val)
 {
        return ((val) << A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK;
 }
 #define A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE            0x00000100
-#define A3XX_HLSQ_CONTROL_1_REG_RESERVED1                      0x00000200
-#define A3XX_HLSQ_CONTROL_1_REG_ZWCOORD                                0x02000000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK         0x00ff0000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT                16
+static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(uint32_t val)
+{
+       return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK;
+}
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK         0xff000000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT                24
+static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(uint32_t val)
+{
+       return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK;
+}
 
 #define REG_A3XX_HLSQ_CONTROL_2_REG                            0x00002202
+#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK            0x000003fc
+#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT           2
+static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(uint32_t val)
+{
+       return ((val) << A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK;
+}
+#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK            0x03fc0000
+#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT           18
+static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID(uint32_t val)
+{
+       return ((val) << A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK;
+}
 #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__MASK       0xfc000000
 #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__SHIFT      26
 static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(uint32_t val)
@@ -1470,13 +1500,13 @@ static inline uint32_t A3XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_VS_CONTROL_REG                           0x00002204
-#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK             0x00000fff
+#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK             0x000003ff
 #define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT            0
 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(uint32_t val)
 {
        return ((val) << A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK;
 }
-#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK                0x00fff000
+#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK                0x001ff000
 #define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT       12
 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val)
 {
@@ -1490,13 +1520,13 @@ static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_FS_CONTROL_REG                           0x00002205
-#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK             0x00000fff
+#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK             0x000003ff
 #define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT            0
 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(uint32_t val)
 {
        return ((val) << A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK;
 }
-#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK                0x00fff000
+#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK                0x001ff000
 #define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT       12
 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val)
 {
@@ -1510,13 +1540,13 @@ static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_CONST_VSPRESV_RANGE_REG                  0x00002206
-#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK     0x0000ffff
+#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK     0x000001ff
 #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT    0
 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY(uint32_t val)
 {
        return ((val) << A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK;
 }
-#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK       0xffff0000
+#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK       0x01ff0000
 #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__SHIFT      16
 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 {
@@ -1524,13 +1554,13 @@ static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG                  0x00002207
-#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK     0x0000ffff
+#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK     0x000001ff
 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT    0
 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(uint32_t val)
 {
        return ((val) << A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK;
 }
-#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK       0xffff0000
+#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK       0x01ff0000
 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__SHIFT      16
 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 {
@@ -2012,24 +2042,19 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe
        return ((val) << A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__MASK;
 }
 #define A3XX_SP_VS_CTRL_REG0_CACHEINVALID                      0x00000004
+#define A3XX_SP_VS_CTRL_REG0_ALUSCHMODE                                0x00000008
 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK            0x000003f0
 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT           4
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK            0x0003fc00
+#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK            0x0000fc00
 #define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT           10
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK             0x000c0000
-#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT            18
-static inline uint32_t A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val)
-{
-       return ((val) << A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK;
-}
 #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK                  0x00100000
 #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT                 20
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
@@ -2037,8 +2062,6 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
        return ((val) << A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT) & A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK;
 }
 #define A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE                   0x00200000
-#define A3XX_SP_VS_CTRL_REG0_PIXLODENABLE                      0x00400000
-#define A3XX_SP_VS_CTRL_REG0_COMPUTEMODE                       0x00800000
 #define A3XX_SP_VS_CTRL_REG0_LENGTH__MASK                      0xff000000
 #define A3XX_SP_VS_CTRL_REG0_LENGTH__SHIFT                     24
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_LENGTH(uint32_t val)
@@ -2079,7 +2102,8 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_PSIZEREGID(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_PARAM_REG_PSIZEREGID__SHIFT) & A3XX_SP_VS_PARAM_REG_PSIZEREGID__MASK;
 }
-#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK               0xfff00000
+#define A3XX_SP_VS_PARAM_REG_POS2DMODE                         0x00010000
+#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK               0x01f00000
 #define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__SHIFT              20
 static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val)
 {
@@ -2089,24 +2113,26 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val)
 static inline uint32_t REG_A3XX_SP_VS_OUT(uint32_t i0) { return 0x000022c7 + 0x1*i0; }
 
 static inline uint32_t REG_A3XX_SP_VS_OUT_REG(uint32_t i0) { return 0x000022c7 + 0x1*i0; }
-#define A3XX_SP_VS_OUT_REG_A_REGID__MASK                       0x000001ff
+#define A3XX_SP_VS_OUT_REG_A_REGID__MASK                       0x000000ff
 #define A3XX_SP_VS_OUT_REG_A_REGID__SHIFT                      0
 static inline uint32_t A3XX_SP_VS_OUT_REG_A_REGID(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_OUT_REG_A_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_A_REGID__MASK;
 }
+#define A3XX_SP_VS_OUT_REG_A_HALF                              0x00000100
 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK                    0x00001e00
 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT                   9
 static inline uint32_t A3XX_SP_VS_OUT_REG_A_COMPMASK(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT) & A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK;
 }
-#define A3XX_SP_VS_OUT_REG_B_REGID__MASK                       0x01ff0000
+#define A3XX_SP_VS_OUT_REG_B_REGID__MASK                       0x00ff0000
 #define A3XX_SP_VS_OUT_REG_B_REGID__SHIFT                      16
 static inline uint32_t A3XX_SP_VS_OUT_REG_B_REGID(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_OUT_REG_B_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_B_REGID__MASK;
 }
+#define A3XX_SP_VS_OUT_REG_B_HALF                              0x01000000
 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__MASK                    0x1e000000
 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__SHIFT                   25
 static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val)
@@ -2117,25 +2143,25 @@ static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val)
 static inline uint32_t REG_A3XX_SP_VS_VPC_DST(uint32_t i0) { return 0x000022d0 + 0x1*i0; }
 
 static inline uint32_t REG_A3XX_SP_VS_VPC_DST_REG(uint32_t i0) { return 0x000022d0 + 0x1*i0; }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK                   0x000000ff
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK                   0x0000007f
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT                  0
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC0(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK                   0x0000ff00
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK                   0x00007f00
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT                  8
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC1(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK                   0x00ff0000
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK                   0x007f0000
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT                  16
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC2(uint32_t val)
 {
        return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK                   0xff000000
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK                   0x7f000000
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__SHIFT                  24
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 {
@@ -2143,6 +2169,12 @@ static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 }
 
 #define REG_A3XX_SP_VS_OBJ_OFFSET_REG                          0x000022d4
+#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK   0x0000ffff
+#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT  0
+static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val)
+{
+       return ((val) << A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK;
+}
 #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK      0x01ff0000
 #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT     16
 static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val)
@@ -2159,8 +2191,38 @@ static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A3XX_SP_VS_OBJ_START_REG                           0x000022d5
 
 #define REG_A3XX_SP_VS_PVT_MEM_PARAM_REG                       0x000022d6
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK      0x000000ff
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT     0
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val)
+{
+       return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK       0x00ffff00
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT      8
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val)
+{
+       return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK        0xff000000
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT       24
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val)
+{
+       return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK;
+}
 
 #define REG_A3XX_SP_VS_PVT_MEM_ADDR_REG                                0x000022d7
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK             0x0000001f
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT            0
+static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val)
+{
+       return ((val) << A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK   0xffffffe0
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT  5
+static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val)
+{
+       return ((val >> 5) << A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK;
+}
 
 #define REG_A3XX_SP_VS_PVT_MEM_SIZE_REG                                0x000022d8
 
@@ -2186,24 +2248,22 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe
        return ((val) << A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__MASK;
 }
 #define A3XX_SP_FS_CTRL_REG0_CACHEINVALID                      0x00000004
+#define A3XX_SP_FS_CTRL_REG0_ALUSCHMODE                                0x00000008
 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK            0x000003f0
 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT           4
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
 {
        return ((val) << A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK            0x0003fc00
+#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK            0x0000fc00
 #define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT           10
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
 {
        return ((val) << A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK             0x000c0000
-#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT            18
-static inline uint32_t A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val)
-{
-       return ((val) << A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK;
-}
+#define A3XX_SP_FS_CTRL_REG0_FSBYPASSENABLE                    0x00020000
+#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP                   0x00040000
+#define A3XX_SP_FS_CTRL_REG0_OUTORDERED                                0x00080000
 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__MASK                  0x00100000
 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT                 20
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
@@ -2239,7 +2299,7 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(uint32_t val)
 {
        return ((val) << A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__SHIFT) & A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK           0x3f000000
+#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK           0x7f000000
 #define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__SHIFT          24
 static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val)
 {
@@ -2247,6 +2307,12 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val)
 }
 
 #define REG_A3XX_SP_FS_OBJ_OFFSET_REG                          0x000022e2
+#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK   0x0000ffff
+#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT  0
+static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val)
+{
+       return ((val) << A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK;
+}
 #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK      0x01ff0000
 #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT     16
 static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val)
@@ -2263,8 +2329,38 @@ static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A3XX_SP_FS_OBJ_START_REG                           0x000022e3
 
 #define REG_A3XX_SP_FS_PVT_MEM_PARAM_REG                       0x000022e4
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK      0x000000ff
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT     0
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val)
+{
+       return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK       0x00ffff00
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT      8
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val)
+{
+       return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK        0xff000000
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT       24
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val)
+{
+       return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK;
+}
 
 #define REG_A3XX_SP_FS_PVT_MEM_ADDR_REG                                0x000022e5
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK             0x0000001f
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT            0
+static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val)
+{
+       return ((val) << A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK   0xffffffe0
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT  5
+static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val)
+{
+       return ((val >> 5) << A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK;
+}
 
 #define REG_A3XX_SP_FS_PVT_MEM_SIZE_REG                                0x000022e6
 
index 74cbbf2..e47bbff 100644 (file)
@@ -171,8 +171,8 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
        fd3_query_context_init(pctx);
 
-       fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-                       2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+       fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
        return pctx;
 }
index 24afbc9..e65a352 100644 (file)
@@ -145,7 +145,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
        void *ptr;
 
        u_upload_alloc(fd3_ctx->border_color_uploader,
-                       0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+                       0, BORDER_COLOR_UPLOAD_SIZE,
+                      BORDER_COLOR_UPLOAD_SIZE, &off,
                        &fd3_ctx->border_color_buf,
                        &ptr);
 
index 7361516..a64ecf1 100644 (file)
@@ -229,7 +229,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
                        A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
        OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
                        A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
-                       COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD));
+                       COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(regid(0,0)) |
+                                       A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(regid(0,2))));
        OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
        OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
        OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
@@ -254,10 +255,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
                        COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
                        A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
                        A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
-                       A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
                        A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
                        A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
-                       COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
                        A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
        OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
                        A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
@@ -336,7 +335,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
                                COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
                                A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
                                A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
-                               A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
+                               A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP |
                                A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
                                A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
                                COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
index a450379..e8df429 100644 (file)
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -157,58 +157,62 @@ enum a4xx_vtx_fmt {
        VFMT4_10_10_10_2_UNORM = 57,
        VFMT4_10_10_10_2_SINT = 58,
        VFMT4_10_10_10_2_SNORM = 59,
+       VFMT4_2_10_10_10_UINT = 60,
+       VFMT4_2_10_10_10_UNORM = 61,
+       VFMT4_2_10_10_10_SINT = 62,
+       VFMT4_2_10_10_10_SNORM = 63,
 };
 
 enum a4xx_tex_fmt {
-       TFMT4_5_6_5_UNORM = 11,
-       TFMT4_5_5_5_1_UNORM = 9,
-       TFMT4_4_4_4_4_UNORM = 8,
-       TFMT4_X8Z24_UNORM = 71,
-       TFMT4_10_10_10_2_UNORM = 33,
-       TFMT4_10_10_10_2_UINT = 34,
        TFMT4_A8_UNORM = 3,
-       TFMT4_L8_A8_UNORM = 13,
        TFMT4_8_UNORM = 4,
-       TFMT4_8_8_UNORM = 14,
-       TFMT4_8_8_8_8_UNORM = 28,
        TFMT4_8_SNORM = 5,
-       TFMT4_8_8_SNORM = 15,
-       TFMT4_8_8_8_8_SNORM = 29,
        TFMT4_8_UINT = 6,
-       TFMT4_8_8_UINT = 16,
-       TFMT4_8_8_8_8_UINT = 30,
        TFMT4_8_SINT = 7,
+       TFMT4_4_4_4_4_UNORM = 8,
+       TFMT4_5_5_5_1_UNORM = 9,
+       TFMT4_5_6_5_UNORM = 11,
+       TFMT4_L8_A8_UNORM = 13,
+       TFMT4_8_8_UNORM = 14,
+       TFMT4_8_8_SNORM = 15,
+       TFMT4_8_8_UINT = 16,
        TFMT4_8_8_SINT = 17,
-       TFMT4_8_8_8_8_SINT = 31,
        TFMT4_16_UNORM = 18,
-       TFMT4_16_16_UNORM = 38,
-       TFMT4_16_16_16_16_UNORM = 51,
        TFMT4_16_SNORM = 19,
-       TFMT4_16_16_SNORM = 39,
-       TFMT4_16_16_16_16_SNORM = 52,
+       TFMT4_16_FLOAT = 20,
        TFMT4_16_UINT = 21,
-       TFMT4_16_16_UINT = 41,
-       TFMT4_16_16_16_16_UINT = 54,
        TFMT4_16_SINT = 22,
+       TFMT4_8_8_8_8_UNORM = 28,
+       TFMT4_8_8_8_8_SNORM = 29,
+       TFMT4_8_8_8_8_UINT = 30,
+       TFMT4_8_8_8_8_SINT = 31,
+       TFMT4_9_9_9_E5_FLOAT = 32,
+       TFMT4_10_10_10_2_UNORM = 33,
+       TFMT4_10_10_10_2_UINT = 34,
+       TFMT4_11_11_10_FLOAT = 37,
+       TFMT4_16_16_UNORM = 38,
+       TFMT4_16_16_SNORM = 39,
+       TFMT4_16_16_FLOAT = 40,
+       TFMT4_16_16_UINT = 41,
        TFMT4_16_16_SINT = 42,
-       TFMT4_16_16_16_16_SINT = 55,
+       TFMT4_32_FLOAT = 43,
        TFMT4_32_UINT = 44,
-       TFMT4_32_32_UINT = 57,
-       TFMT4_32_32_32_32_UINT = 64,
        TFMT4_32_SINT = 45,
-       TFMT4_32_32_SINT = 58,
-       TFMT4_32_32_32_32_SINT = 65,
-       TFMT4_16_FLOAT = 20,
-       TFMT4_16_16_FLOAT = 40,
+       TFMT4_16_16_16_16_UNORM = 51,
+       TFMT4_16_16_16_16_SNORM = 52,
        TFMT4_16_16_16_16_FLOAT = 53,
-       TFMT4_32_FLOAT = 43,
+       TFMT4_16_16_16_16_UINT = 54,
+       TFMT4_16_16_16_16_SINT = 55,
        TFMT4_32_32_FLOAT = 56,
-       TFMT4_32_32_32_32_FLOAT = 63,
+       TFMT4_32_32_UINT = 57,
+       TFMT4_32_32_SINT = 58,
        TFMT4_32_32_32_FLOAT = 59,
        TFMT4_32_32_32_UINT = 60,
        TFMT4_32_32_32_SINT = 61,
-       TFMT4_9_9_9_E5_FLOAT = 32,
-       TFMT4_11_11_10_FLOAT = 37,
+       TFMT4_32_32_32_32_FLOAT = 63,
+       TFMT4_32_32_32_32_UINT = 64,
+       TFMT4_32_32_32_32_SINT = 65,
+       TFMT4_X8Z24_UNORM = 71,
        TFMT4_DXT1 = 86,
        TFMT4_DXT3 = 87,
        TFMT4_DXT5 = 88,
@@ -800,6 +804,7 @@ static inline uint32_t A4XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
 }
 #define A4XX_RB_DEPTH_CONTROL_BF_ENABLE                                0x00000080
 #define A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE                  0x00010000
+#define A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS                        0x00020000
 #define A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE                    0x80000000
 
 #define REG_A4XX_RB_DEPTH_CLEAR                                        0x00002102
@@ -1060,6 +1065,9 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_TP_REG(uint32_t i0) { return 0x
 
 #define REG_A4XX_RBBM_CFG_DEBBUS_SEL_D                         0x0000004d
 
+#define REG_A4XX_RBBM_POWER_CNTL_IP                            0x00000098
+#define A4XX_RBBM_POWER_CNTL_IP_SW_COLLAPSE                    0x00000001
+
 #define REG_A4XX_RBBM_PERFCTR_CP_0_LO                          0x0000009c
 
 static inline uint32_t REG_A4XX_RBBM_CLOCK_CTL_SP(uint32_t i0) { return 0x00000068 + 0x1*i0; }
@@ -1110,6 +1118,10 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1(uint32_t i0) { r
 
 static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0) { return 0x0000008e + 0x1*i0; }
 
+#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_0                  0x00000099
+
+#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_1                  0x0000009a
+
 #define REG_A4XX_RBBM_PERFCTR_PWR_1_LO                         0x00000168
 
 #define REG_A4XX_RBBM_PERFCTR_CTL                              0x00000170
@@ -1163,6 +1175,11 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0)
 
 #define REG_A4XX_RBBM_INTERFACE_RRDY_STATUS5                   0x0000019f
 
+#define REG_A4XX_RBBM_POWER_STATUS                             0x000001b0
+#define A4XX_RBBM_POWER_STATUS_SP_TP_PWR_ON                    0x00100000
+
+#define REG_A4XX_RBBM_WAIT_IDLE_CLOCKS_CTL2                    0x000001b8
+
 #define REG_A4XX_CP_SCRATCH_UMASK                              0x00000228
 
 #define REG_A4XX_CP_SCRATCH_ADDR                               0x00000229
@@ -1265,6 +1282,28 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_MODE_CONTROL                               0x00000ec3
 
+#define REG_A4XX_SP_PERFCTR_SP_SEL_0                           0x00000ec4
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_1                           0x00000ec5
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_2                           0x00000ec6
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_3                           0x00000ec7
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_4                           0x00000ec8
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_5                           0x00000ec9
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_6                           0x00000eca
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_7                           0x00000ecb
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_8                           0x00000ecc
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_9                           0x00000ecd
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_10                          0x00000ece
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11                          0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG                                        0x000022c0
@@ -2180,6 +2219,7 @@ static inline uint32_t A4XX_GRAS_SU_POINT_SIZE(float val)
 
 #define REG_A4XX_GRAS_ALPHA_CONTROL                            0x00002073
 #define A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE              0x00000004
+#define A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS              0x00000008
 
 #define REG_A4XX_GRAS_SU_POLY_OFFSET_SCALE                     0x00002074
 #define A4XX_GRAS_SU_POLY_OFFSET_SCALE__MASK                   0xffffffff
index e53e0c5..7d6365b 100644 (file)
@@ -171,8 +171,8 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 
        fd4_query_context_init(pctx);
 
-       fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096,
-                       2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+       fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0,
+                                                         PIPE_USAGE_STREAM);
 
        return pctx;
 }
index f220fc7..bc62a5d 100644 (file)
@@ -133,7 +133,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
        void *ptr;
 
        u_upload_alloc(fd4_ctx->border_color_uploader,
-                       0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+                       0, BORDER_COLOR_UPLOAD_SIZE,
+                      BORDER_COLOR_UPLOAD_SIZE, &off,
                        &fd4_ctx->border_color_buf,
                        &ptr);
 
@@ -529,14 +530,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
                OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
                OUT_RING(ring, zsa->rb_depth_control |
-                               COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE));
+                               COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) |
+                               COND(fragz && fp->frag_coord, A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS));
 
                /* maybe this register/bitfield needs a better name.. this
                 * appears to be just disabling early-z
                 */
                OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
                OUT_RING(ring, zsa->gras_alpha_control |
-                               COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE));
+                               COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE) |
+                               COND(fragz && fp->frag_coord, A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS));
        }
 
        if (dirty & FD_DIRTY_RASTERIZER) {
index 0e861b9..32b8fce 100644 (file)
@@ -420,9 +420,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
                        COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
                        COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD |
                                        A4XX_RB_RENDER_CONTROL2_YCOORD |
-// TODO enabling gl_FragCoord.z is causing lockups on 0ad (but seems
-// to work everywhere else).
-//                                     A4XX_RB_RENDER_CONTROL2_ZCOORD |
+                                       A4XX_RB_RENDER_CONTROL2_ZCOORD |
                                        A4XX_RB_RENDER_CONTROL2_WCOORD));
 
        OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
index 0e0f0e6..f9c0e6a 100644 (file)
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
index 4aabc08..c674189 100644 (file)
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -199,7 +199,11 @@ enum adreno_state_type {
 
 enum adreno_state_src {
        SS_DIRECT = 0,
+       SS_INVALID_ALL_IC = 2,
+       SS_INVALID_PART_IC = 3,
        SS_INDIRECT = 4,
+       SS_INDIRECT_TCM = 5,
+       SS_INDIRECT_STM = 6,
 };
 
 enum a4xx_index_size {
@@ -227,7 +231,7 @@ static inline uint32_t CP_LOAD_STATE_0_STATE_BLOCK(enum adreno_state_block val)
 {
        return ((val) << CP_LOAD_STATE_0_STATE_BLOCK__SHIFT) & CP_LOAD_STATE_0_STATE_BLOCK__MASK;
 }
-#define CP_LOAD_STATE_0_NUM_UNIT__MASK                         0x7fc00000
+#define CP_LOAD_STATE_0_NUM_UNIT__MASK                         0xffc00000
 #define CP_LOAD_STATE_0_NUM_UNIT__SHIFT                                22
 static inline uint32_t CP_LOAD_STATE_0_NUM_UNIT(uint32_t val)
 {
index 571c814..418b71b 100644 (file)
@@ -40,6 +40,8 @@
 #include "freedreno_gmem.h"
 #include "freedreno_util.h"
 
+#define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE)
+
 struct fd_vertex_stateobj;
 
 struct fd_texture_stateobj {
index 5bbe401..a75b04b 100644 (file)
@@ -226,6 +226,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
        case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
        case PIPE_CAP_DRAW_INDIRECT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
        case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
        case PIPE_CAP_POLYGON_OFFSET_CLAMP:
        case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
@@ -238,6 +240,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_SHAREABLE_SHADERS:
        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
        case PIPE_CAP_CLEAR_TEXTURE:
+       case PIPE_CAP_DRAW_PARAMETERS:
+       case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+       case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+       case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+       case PIPE_CAP_INVALIDATE_BUFFER:
+       case PIPE_CAP_GENERATE_MIPMAP:
                return 0;
 
        case PIPE_CAP_MAX_VIEWPORTS:
@@ -414,6 +423,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                return PIPE_SHADER_IR_TGSI;
        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
                return 32;
+       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+               return 0;
        }
        debug_printf("unknown shader param %d\n", param);
        return 0;
index e768e61..481859e 100644 (file)
@@ -40,6 +40,7 @@
 #include "freedreno_util.h"
 
 #include "ir3_compiler.h"
+#include "ir3_nir.h"
 #include "instr-a3xx.h"
 #include "ir3.h"
 
@@ -105,17 +106,15 @@ int main(int argc, char **argv)
        const char *filename;
        struct tgsi_token toks[65536];
        struct tgsi_parse_context parse;
-       struct ir3_compiler *compiler;
        struct ir3_shader_variant v;
        struct ir3_shader s;
        struct ir3_shader_key key = {};
+       /* TODO cmdline option to target different gpus: */
        unsigned gpu_id = 320;
        const char *info;
        void *ptr;
        size_t size;
 
-       fd_mesa_debug |= FD_DBG_DISASM;
-
        memset(&s, 0, sizeof(s));
        memset(&v, 0, sizeof(v));
 
@@ -128,7 +127,7 @@ int main(int argc, char **argv)
 
        while (n < argc) {
                if (!strcmp(argv[n], "--verbose")) {
-                       fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
+                       fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS | FD_DBG_DISASM;
                        n++;
                        continue;
                }
@@ -230,7 +229,12 @@ int main(int argc, char **argv)
        if (!tgsi_text_translate(ptr, toks, Elements(toks)))
                errx(1, "could not parse `%s'", filename);
 
-       s.tokens = toks;
+       if (fd_mesa_debug & FD_DBG_OPTMSGS)
+               tgsi_dump(toks, 0);
+
+       nir_shader *nir = ir3_tgsi_to_nir(toks);
+       s.compiler = ir3_compiler_create(gpu_id);
+       s.nir = ir3_optimize_nir(&s, nir, NULL);
 
        v.key = key;
        v.shader = &s;
@@ -248,11 +252,8 @@ int main(int argc, char **argv)
                break;
        }
 
-       /* TODO cmdline option to target different gpus: */
-       compiler = ir3_compiler_create(gpu_id);
-
        info = "NIR compiler";
-       ret = ir3_compile_shader_nir(compiler, &v);
+       ret = ir3_compile_shader_nir(s.compiler, &v);
        if (ret) {
                fprintf(stderr, "compiler failed!\n");
                return ret;
index eea5c5e..86afda4 100644 (file)
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
 
 #include "freedreno_util.h"
 
@@ -123,97 +119,10 @@ struct ir3_compile {
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
 
-static struct nir_shader *to_nir(struct ir3_compile *ctx,
-               const struct tgsi_token *tokens, struct ir3_shader_variant *so)
-{
-       static const nir_shader_compiler_options options = {
-                       .lower_fpow = true,
-                       .lower_fsat = true,
-                       .lower_scmp = true,
-                       .lower_flrp = true,
-                       .lower_ffract = true,
-                       .native_integers = true,
-       };
-       struct nir_lower_tex_options tex_options = {
-                       .lower_rect = 0,
-       };
-       bool progress;
-
-       switch (so->type) {
-       case SHADER_FRAGMENT:
-       case SHADER_COMPUTE:
-               tex_options.saturate_s = so->key.fsaturate_s;
-               tex_options.saturate_t = so->key.fsaturate_t;
-               tex_options.saturate_r = so->key.fsaturate_r;
-               break;
-       case SHADER_VERTEX:
-               tex_options.saturate_s = so->key.vsaturate_s;
-               tex_options.saturate_t = so->key.vsaturate_t;
-               tex_options.saturate_r = so->key.vsaturate_r;
-               break;
-       }
-
-       if (ctx->compiler->gpu_id >= 400) {
-               /* a4xx seems to have *no* sam.p */
-               tex_options.lower_txp = ~0;  /* lower all txp */
-       } else {
-               /* a3xx just needs to avoid sam.p for 3d tex */
-               tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-       }
-
-       struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-       if (fd_mesa_debug & FD_DBG_DISASM) {
-               debug_printf("----------------------\n");
-               nir_print_shader(s, stdout);
-               debug_printf("----------------------\n");
-       }
-
-       nir_opt_global_to_local(s);
-       nir_convert_to_ssa(s);
-       if (s->stage == MESA_SHADER_VERTEX) {
-               nir_lower_clip_vs(s, so->key.ucp_enables);
-       } else if (s->stage == MESA_SHADER_FRAGMENT) {
-               nir_lower_clip_fs(s, so->key.ucp_enables);
-       }
-       nir_lower_tex(s, &tex_options);
-       if (so->key.color_two_side)
-               nir_lower_two_sided_color(s);
-       nir_lower_idiv(s);
-       nir_lower_load_const_to_scalar(s);
-
-       do {
-               progress = false;
-
-               nir_lower_vars_to_ssa(s);
-               nir_lower_alu_to_scalar(s);
-               nir_lower_phis_to_scalar(s);
-
-               progress |= nir_copy_prop(s);
-               progress |= nir_opt_dce(s);
-               progress |= nir_opt_cse(s);
-               progress |= ir3_nir_lower_if_else(s);
-               progress |= nir_opt_algebraic(s);
-               progress |= nir_opt_constant_folding(s);
-
-       } while (progress);
-
-       nir_remove_dead_variables(s);
-       nir_validate_shader(s);
-
-       if (fd_mesa_debug & FD_DBG_DISASM) {
-               debug_printf("----------------------\n");
-               nir_print_shader(s, stdout);
-               debug_printf("----------------------\n");
-       }
-
-       return s;
-}
 
 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so,
-               const struct tgsi_token *tokens)
+               struct ir3_shader_variant *so)
 {
        struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 
@@ -239,7 +148,28 @@ compile_init(struct ir3_compiler *compiler,
        ctx->block_ht = _mesa_hash_table_create(ctx,
                        _mesa_hash_pointer, _mesa_key_pointer_equal);
 
-       ctx->s = to_nir(ctx, tokens, so);
+       /* TODO: maybe generate some sort of bitmask of what key
+        * lowers vs what shader has (ie. no need to lower
+        * texture clamp lowering if no texture sample instrs)..
+        * although should be done further up the stack to avoid
+        * creating duplicate variants..
+        */
+
+       if (ir3_key_lowers_nir(&so->key)) {
+               nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+               ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+       } else {
+               /* fast-path for shader key that lowers nothing in NIR: */
+               ctx->s = so->shader->nir;
+       }
+
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+                       so->shader->id, so->id, so->type,
+                       so->key.binning_pass, so->key.color_two_side,
+                       so->key.half_precision);
+               nir_print_shader(ctx->s, stdout);
+       }
 
        so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
@@ -1264,7 +1194,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array reads: */
 static void
-emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
                struct ir3_instruction **dst)
 {
        nir_deref_var *dvar = intr->variables[0];
@@ -1305,7 +1235,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array writes: */
 static void
-emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
        nir_deref_var *dvar = intr->variables[0];
        nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
@@ -1321,6 +1251,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
        case nir_deref_array_type_direct:
                /* direct access does not require anything special: */
                for (int i = 0; i < intr->num_components; i++) {
+                       /* ttn doesn't generate partial writemasks */
+                       assert(intr->const_index[0] ==
+                              (1 << intr->num_components) - 1);
+
                        unsigned n = darr->base_offset * 4 + i;
                        compile_assert(ctx, n < arr->length);
                        arr->arr[n] = src[i];
@@ -1333,6 +1267,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
                struct ir3_instruction *addr =
                                get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
                for (int i = 0; i < intr->num_components; i++) {
+                       /* ttn doesn't generate partial writemasks */
+                       assert(intr->const_index[0] ==
+                              (1 << intr->num_components) - 1);
+
                        struct ir3_instruction *store;
                        unsigned n = darr->base_offset * 4 + i;
                        compile_assert(ctx, n < arr->length);
@@ -1392,7 +1330,7 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 }
 
 static void
-emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        struct ir3_instruction **dst, **src;
@@ -1454,10 +1392,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
                }
                break;
        case nir_intrinsic_load_var:
-               emit_intrinisic_load_var(ctx, intr, dst);
+               emit_intrinsic_load_var(ctx, intr, dst);
                break;
        case nir_intrinsic_store_var:
-               emit_intrinisic_store_var(ctx, intr);
+               emit_intrinsic_store_var(ctx, intr);
                break;
        case nir_intrinsic_store_output:
                const_offset = nir_src_as_const_value(intr->src[1]);
@@ -1927,7 +1865,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
                emit_alu(ctx, nir_instr_as_alu(instr));
                break;
        case nir_instr_type_intrinsic:
-               emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
+               emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
                break;
        case nir_instr_type_load_const:
                emit_load_const(ctx, nir_instr_as_load_const(instr));
@@ -1946,8 +1884,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
                case nir_texop_query_levels:
                        emit_tex_query_levels(ctx, tex);
                        break;
-               case nir_texop_samples_identical:
-                       unreachable("nir_texop_samples_identical");
                default:
                        emit_tex(ctx, tex);
                        break;
@@ -2162,6 +2098,8 @@ emit_stream_out(struct ir3_compile *ctx)
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+       nir_metadata_require(impl, nir_metadata_block_index);
+
        emit_cf_list(ctx, &impl->body);
        emit_block(ctx, impl->end_block);
 
@@ -2343,10 +2281,10 @@ emit_instructions(struct ir3_compile *ctx)
        nir_function_impl *fxn = NULL;
 
        /* Find the main function: */
-       nir_foreach_overload(ctx->s, overload) {
-               compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
-               compile_assert(ctx, overload->impl);
-               fxn = overload->impl;
+       nir_foreach_function(ctx->s, function) {
+               compile_assert(ctx, strcmp(function->name, "main") == 0);
+               compile_assert(ctx, function->impl);
+               fxn = function->impl;
                break;
        }
 
@@ -2491,7 +2429,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
        assert(!so->ir);
 
-       ctx = compile_init(compiler, so, so->shader->tokens);
+       ctx = compile_init(compiler, so);
        if (!ctx) {
                DBG("INIT failed!");
                ret = -1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
new file mode 100644 (file)
index 0000000..565b9c3
--- /dev/null
@@ -0,0 +1,153 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "freedreno_util.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "nir/tgsi_to_nir.h"
+
+struct nir_shader *
+ir3_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+       static const nir_shader_compiler_options options = {
+                       .lower_fpow = true,
+                       .lower_fsat = true,
+                       .lower_scmp = true,
+                       .lower_flrp = true,
+                       .lower_ffract = true,
+                       .native_integers = true,
+       };
+       return tgsi_to_nir(tokens, &options);
+}
+
+/* for given shader key, are any steps handled in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+       return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+                       key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+                       key->ucp_enables | key->color_two_side;
+}
+
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+               const struct ir3_shader_key *key)
+{
+       struct nir_lower_tex_options tex_options = {
+                       .lower_rect = 0,
+       };
+       bool progress;
+
+       if (key) {
+               switch (shader->type) {
+               case SHADER_FRAGMENT:
+               case SHADER_COMPUTE:
+                       tex_options.saturate_s = key->fsaturate_s;
+                       tex_options.saturate_t = key->fsaturate_t;
+                       tex_options.saturate_r = key->fsaturate_r;
+                       break;
+               case SHADER_VERTEX:
+                       tex_options.saturate_s = key->vsaturate_s;
+                       tex_options.saturate_t = key->vsaturate_t;
+                       tex_options.saturate_r = key->vsaturate_r;
+                       break;
+               }
+       }
+
+       if (shader->compiler->gpu_id >= 400) {
+               /* a4xx seems to have *no* sam.p */
+               tex_options.lower_txp = ~0;  /* lower all txp */
+       } else {
+               /* a3xx just needs to avoid sam.p for 3d tex */
+               tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+       }
+
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               debug_printf("----------------------\n");
+               nir_print_shader(s, stdout);
+               debug_printf("----------------------\n");
+       }
+
+       OPT_V(s, nir_opt_global_to_local);
+       OPT_V(s, nir_convert_to_ssa);
+
+       if (key) {
+               if (s->stage == MESA_SHADER_VERTEX) {
+                       OPT_V(s, nir_lower_clip_vs, key->ucp_enables);
+               } else if (s->stage == MESA_SHADER_FRAGMENT) {
+                       OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+               }
+               if (key->color_two_side) {
+                       OPT_V(s, nir_lower_two_sided_color);
+               }
+       }
+
+       OPT_V(s, nir_lower_tex, &tex_options);
+       OPT_V(s, nir_lower_idiv);
+       OPT_V(s, nir_lower_load_const_to_scalar);
+
+       do {
+               progress = false;
+
+               OPT_V(s, nir_lower_vars_to_ssa);
+               OPT_V(s, nir_lower_alu_to_scalar);
+               OPT_V(s, nir_lower_phis_to_scalar);
+
+               progress |= OPT(s, nir_copy_prop);
+               progress |= OPT(s, nir_opt_dce);
+               progress |= OPT(s, nir_opt_cse);
+               progress |= OPT(s, ir3_nir_lower_if_else);
+               progress |= OPT(s, nir_opt_algebraic);
+               progress |= OPT(s, nir_opt_constant_folding);
+
+       } while (progress);
+
+       OPT_V(s, nir_remove_dead_variables);
+
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               debug_printf("----------------------\n");
+               nir_print_shader(s, stdout);
+               debug_printf("----------------------\n");
+       }
+
+       nir_sweep(s);
+
+       return s;
+}
index 9950782..534199d 100644 (file)
 #include "glsl/nir/nir.h"
 #include "glsl/nir/shader_enums.h"
 
+#include "ir3_shader.h"
+
 bool ir3_nir_lower_if_else(nir_shader *shader);
 
+struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+               const struct ir3_shader_key *key);
+
 #endif /* IR3_NIR_H_ */
index 4ec0e2b..6eee2eb 100644 (file)
@@ -328,9 +328,9 @@ ir3_nir_lower_if_else(nir_shader *shader)
 {
        bool progress = false;
 
-       nir_foreach_overload(shader, overload) {
-               if (overload->impl)
-                       progress |= lower_if_else_impl(overload->impl);
+       nir_foreach_function(shader, function) {
+               if (function->impl)
+                       progress |= lower_if_else_impl(function->impl);
        }
 
        return progress;
index 07e03d2..a84e798 100644 (file)
@@ -143,7 +143,7 @@ block_id(struct ir3_block *block)
 #ifdef DEBUG
        return block->serialno;
 #else
-       return (uint32_t)(uint64_t)block;
+       return (uint32_t)(unsigned long)block;
 #endif
 }
 
index 7b56533..7d17f42 100644 (file)
@@ -39,7 +39,7 @@
 
 #include "ir3_shader.h"
 #include "ir3_compiler.h"
-
+#include "ir3_nir.h"
 
 static void
 delete_variant(struct ir3_shader_variant *v)
@@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
        v->key = key;
        v->type = shader->type;
 
-       if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
-                       key.binning_pass, key.color_two_side, key.half_precision);
-               tgsi_dump(shader->tokens, 0);
-       }
-
        ret = ir3_compile_shader_nir(shader->compiler, v);
        if (ret) {
                debug_error("compile failed!");
@@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader)
                v = v->next;
                delete_variant(t);
        }
-       free((void *)shader->tokens);
+       ralloc_free(shader->nir);
        free(shader);
 }
 
@@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx,
        shader->id = ++shader->compiler->shader_count;
        shader->pctx = pctx;
        shader->type = type;
-       shader->tokens = tgsi_dup_tokens(cso->tokens);
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               DBG("dump tgsi: type=%d", shader->type);
+               tgsi_dump(cso->tokens, 0);
+       }
+       nir_shader *nir = ir3_tgsi_to_nir(cso->tokens);
+       /* do first pass optimization, ignoring the key: */
+       shader->nir = ir3_optimize_nir(shader, nir, NULL);
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               DBG("dump nir%d: type=%d", shader->id, shader->type);
+               nir_print_shader(shader->nir, stdout);
+       }
        shader->stream_output = cso->stream_output;
        if (fd_mesa_debug & FD_DBG_SHADERDB) {
                /* if shader-db run, create a standard variant immediately
                 * (as otherwise nothing will trigger the shader to be
                 * actually compiled)
                 */
-               static struct ir3_shader_key key = {};
+               static struct ir3_shader_key key = {0};
                ir3_shader_variant(shader, key);
        }
        return shader;
index cf99a4c..b3c28a4 100644 (file)
@@ -230,6 +230,8 @@ struct ir3_shader_variant {
        struct ir3_shader *shader;
 };
 
+typedef struct nir_shader nir_shader;
+
 struct ir3_shader {
        enum shader_t type;
 
@@ -240,7 +242,7 @@ struct ir3_shader {
        struct ir3_compiler *compiler;
 
        struct pipe_context *pctx;    /* TODO replace w/ pipe_screen */
-       const struct tgsi_token *tokens;
+       nir_shader *nir;
        struct pipe_stream_output_info stream_output;
 
        struct ir3_shader_variant *variants;
index 1ed6851..2adaee3 100644 (file)
@@ -195,7 +195,6 @@ struct i915_rasterizer_state {
 
    unsigned light_twoside : 1;
    unsigned st;
-   enum interp_mode color_interp;
 
    unsigned LIS4;
    unsigned LIS7;
index a5b1618..ede5558 100644 (file)
@@ -254,6 +254,13 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -264,6 +271,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
index 6ba9646..b54a9fb 100644 (file)
@@ -423,7 +423,7 @@ i915_prepare_vertex_sampling(struct i915_context *i915)
          for (j = view->u.tex.first_level; j <= tex->last_level; j++) {
             mip_offsets[j] = i915_texture_offset(i915_tex, j , 0 /* FIXME depth */);
             row_stride[j] = i915_tex->stride;
-            img_stride[j] = 0; /* FIXME */;
+            img_stride[j] = 0; /* FIXME */
          }
 
          draw_set_mapped_texture(i915->draw,
@@ -920,7 +920,6 @@ i915_create_rasterizer_state(struct pipe_context *pipe,
    struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state );
 
    cso->templ = *rasterizer;
-   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    cso->light_twoside = rasterizer->light_twoside;
    cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE;
    cso->ds[1].f = rasterizer->offset_scale;
index 7ad88a1..bd0f448 100644 (file)
@@ -57,7 +57,6 @@ static uint find_mapping(const struct i915_fragment_shader* fs, int unit)
 static void calculate_vertex_layout(struct i915_context *i915)
 {
    const struct i915_fragment_shader *fs = i915->fs;
-   const enum interp_mode colorInterp = i915->rasterizer->color_interp;
    struct vertex_info vinfo;
    boolean texCoords[I915_TEX_UNITS], colors[2], fog, needW, face;
    uint i;
@@ -107,12 +106,12 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* pos */
    src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
    if (needW) {
-      draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
       vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
-      draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_3F, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
       vinfo.attrib[0].emit = EMIT_3F;
    }
@@ -123,21 +122,21 @@ static void calculate_vertex_layout(struct i915_context *i915)
    /* primary color */
    if (colors[0]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_COLOR;
    }
 
    /* secondary color */
    if (colors[1]) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
-      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, colorInterp, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_4UB_BGRA, src);
       vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
    }
 
    /* fog coord, not fog blend factor */
    if (fog) {
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
    }
 
@@ -147,7 +146,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
       if (texCoords[i]) {
          hwtc = TEXCOORDFMT_4D;
          src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, fs->generic_mapping[i]);
-         draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(&vinfo, EMIT_4F, src);
       }
       else {
          hwtc = TEXCOORDFMT_NOT_PRESENT;
@@ -164,7 +163,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
        * module by adding an extra shader output.
        */
       src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0);
-      draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_CONSTANT, src);
+      draw_emit_vertex_attr(&vinfo, EMIT_1F, src);
       vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4));
       vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4);
    }
@@ -185,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
 struct i915_tracked_state i915_update_vertex_layout = {
    "vertex_layout",
    calculate_vertex_layout,
-   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
+   I915_NEW_FS | I915_NEW_VS
 };
 
 
index 9d51951..079872f 100644 (file)
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
                  const struct ilo_dev *dev,
                  struct intel_winsys *winsys)
 {
-   int i;
+   unsigned i;
 
    assert(ilo_is_zeroed(builder, sizeof(*builder)));
 
@@ -366,7 +366,7 @@ ilo_builder_init(struct ilo_builder *builder,
 void
 ilo_builder_reset(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++)
       ilo_builder_writer_reset(builder, i);
@@ -382,7 +382,7 @@ ilo_builder_reset(struct ilo_builder *builder)
 bool
 ilo_builder_begin(struct ilo_builder *builder)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ILO_BUILDER_WRITER_COUNT; i++) {
       if (!ilo_builder_writer_alloc_and_map(builder, i)) {
@@ -407,7 +407,7 @@ struct intel_bo *
 ilo_builder_end(struct ilo_builder *builder, unsigned *used)
 {
    struct ilo_builder_writer *bat;
-   int i;
+   unsigned i;
 
    ilo_builder_batch_patch_sba(builder);
 
index 2a00cf1..6bcd0bc 100644 (file)
@@ -189,8 +189,9 @@ ilo_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
     * These must be called last as u_upload/u_blitter are clients of the pipe
     * context.
     */
-   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024, 16,
-         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER);
+   ilo->uploader = u_upload_create(&ilo->base, 1024 * 1024,
+         PIPE_BIND_CONSTANT_BUFFER | PIPE_BIND_INDEX_BUFFER,
+                                   PIPE_USAGE_STREAM);
    if (!ilo->uploader) {
       ilo_context_destroy(&ilo->base);
       return NULL;
index 9a2ca00..b741590 100644 (file)
@@ -92,7 +92,7 @@ ilo_launch_grid(struct pipe_context *pipe,
    input_buf.buffer_size =
       ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_INPUT_SIZE);
    if (input_buf.buffer_size) {
-      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, input,
+      u_upload_data(ilo->uploader, 0, input_buf.buffer_size, 16, input,
             &input_buf.buffer_offset, &input_buf.buffer);
    }
 
index cfa2fb4..fa32757 100644 (file)
@@ -463,6 +463,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -476,6 +478,13 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
index d89765a..8dc2d38 100644 (file)
@@ -376,7 +376,7 @@ finalize_cbuf_state(struct ilo_context *ilo,
       if (cbuf->cso[i].resource)
          continue;
 
-      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
+      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size, 16,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
       cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
@@ -426,12 +426,12 @@ finalize_index_buffer(struct ilo_context *ilo)
       unsigned hw_offset;
 
       if (vec->ib.state.user_buffer) {
-         u_upload_data(ilo->uploader, 0, size,
+         u_upload_data(ilo->uploader, 0, size, 16,
                vec->ib.state.user_buffer + offset,
                &hw_offset, &vec->ib.hw_resource);
       } else {
          u_upload_buffer(ilo->uploader, 0,
-               vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+               vec->ib.state.offset + offset, size, 16, vec->ib.state.buffer,
                &hw_offset, &vec->ib.hw_resource);
       }
 
index 5250115..f46126e 100644 (file)
@@ -266,7 +266,7 @@ fs_lower_opcode_tgsi_indirect_const(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc, real_src[4];
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    tsrc_transpose(idx, real_src);
 
@@ -319,7 +319,7 @@ fs_lower_opcode_tgsi_const_pcb(struct fs_compile_context *fcc,
    const int grf_subreg = (idx.val32 & 1) * 16;
    struct toy_src src;
    struct toy_dst real_dst[4];
-   int i;
+   unsigned i;
 
    if (!fcc->variant->use_pcb || dim != 0 || idx.file != TOY_FILE_IMM ||
        grf >= fcc->first_attr_grf)
@@ -350,7 +350,7 @@ fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
    struct toy_inst *inst;
    struct toy_src desc;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -396,7 +396,7 @@ fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
    struct toy_src desc;
    struct toy_inst *inst;
    struct toy_dst tmp, real_dst[4];
-   int i;
+   unsigned i;
 
    if (fs_lower_opcode_tgsi_const_pcb(fcc, dst, dim, idx))
       return;
@@ -1168,7 +1168,7 @@ fs_lower_opcode_derivative(struct toy_compiler *tc, struct toy_inst *inst)
 {
    struct toy_dst dst[4];
    struct toy_src src[4];
-   int i;
+   unsigned i;
 
    tdst_transpose(inst->dst, dst);
    tsrc_transpose(inst->src[0], src);
@@ -1257,7 +1257,7 @@ fs_lower_opcode_kil(struct toy_compiler *tc, struct toy_inst *inst)
    }
    else {
       struct toy_src src[4];
-      int i;
+      unsigned i;
 
       tsrc_transpose(inst->src[0], src);
       /* mask out killed pixels */
@@ -1583,7 +1583,7 @@ fs_write_fb(struct fs_compile_context *fcc)
 static void
 fs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi)
 {
-   int i;
+   unsigned i;
 
    sh->out.count = tgsi->num_outputs;
    for (i = 0; i < tgsi->num_outputs; i++) {
@@ -1603,7 +1603,7 @@ static void
 fs_setup_shader_in(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
                    bool flatshade)
 {
-   int i;
+   unsigned i;
 
    sh->in.count = tgsi->num_inputs;
    for (i = 0; i < tgsi->num_inputs; i++) {
index a29baab..0df0afc 100644 (file)
@@ -126,7 +126,7 @@ vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
    tc_MOV(tc, block_offsets, idx);
 
    msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ;
-   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;;
+   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;
    msg_len = 2;
 
    desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
@@ -522,7 +522,7 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
       if (num_coords >= 3) {
          struct toy_dst tmp, max;
          struct toy_src abs_coords[3];
-         int i;
+         unsigned i;
 
          tmp = tc_alloc_tmp(tc);
          max = tdst_writemask(tmp, TOY_WRITEMASK_W);
@@ -804,7 +804,7 @@ static int
 vs_collect_outputs(struct vs_compile_context *vcc, struct toy_src *outs)
 {
    const struct toy_tgsi *tgsi = &vcc->tgsi;
-   int i;
+   unsigned i;
 
    for (i = 0; i < vcc->shader->out.count; i++) {
       const int slot = vcc->output_map[i];
index b725375..1874faa 100644 (file)
@@ -70,7 +70,7 @@ struct linear_scan {
 static void
 linear_scan_free_regs(struct linear_scan *ls, int reg, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++)
       ls->free_regs[ls->num_free_regs++] = reg + count - 1 - i;
@@ -221,7 +221,7 @@ linear_scan_spill(struct linear_scan *ls,
 static void
 linear_scan_spill_range(struct linear_scan *ls, int first, int count)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < count; i++) {
       struct linear_scan_live_interval *interval = &ls->intervals[first + i];
index d38585f..9a7140b 100644 (file)
@@ -1593,7 +1593,7 @@ ra_get_type(struct toy_tgsi *tgsi, const struct tgsi_full_instruction *tgsi_inst
       tgsi_inst->Src[operand].Register.File;
    switch (file) {
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       type = TOY_TYPE_D;
       break;
@@ -1834,7 +1834,7 @@ ra_get_src_indirect(struct toy_tgsi *tgsi,
       src = tsrc_null();
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       is_resource = true;
       /* fall through */
@@ -1918,7 +1918,7 @@ ra_get_src(struct toy_tgsi *tgsi,
       need_vrf = true;
       break;
    case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       assert(!s->Register.Dimension);
       src = tsrc_imm_d(s->Register.Index);
@@ -2256,7 +2256,7 @@ parse_declaration(struct toy_tgsi *tgsi,
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_PREDICATE:
    case TGSI_FILE_ADDRESS:
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_IMAGE:
    case TGSI_FILE_SAMPLER_VIEW:
       /* nothing to do */
       break;
index 0a52642..9029d2a 100644 (file)
@@ -63,8 +63,7 @@ enum lp_interp {
    LP_INTERP_LINEAR,
    LP_INTERP_PERSPECTIVE,
    LP_INTERP_POSITION,
-   LP_INTERP_FACING,
-   LP_INTERP_ZERO
+   LP_INTERP_FACING
 };
 
 struct lp_shader_input {
index 9dcc102..62d99bb 100644 (file)
@@ -108,28 +108,22 @@ struct llvmpipe_context {
    struct vertex_info vertex_info;
    
    /** Which vertex shader output slot contains color */
-   uint8_t color_slot[2];
+   int8_t color_slot[2];
 
    /** Which vertex shader output slot contains bcolor */
-   uint8_t bcolor_slot[2];
+   int8_t bcolor_slot[2];
 
    /** Which vertex shader output slot contains point size */
-   uint8_t psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   uint8_t viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which geometry shader output slot contains layer */
-   uint8_t layer_slot;
+   int8_t layer_slot;
 
    /** A fake frontface output for unfilled primitives */
-   uint8_t face_slot;
-
-   /** Which output slot is used for the fake vp index info */
-   uint8_t fake_vpindex_slot;
-
-   /** Which output slot is used for the fake layer info */
-   uint8_t fake_layer_slot;
+   int8_t face_slot;
 
    /** Depth format and bias settings. */
    boolean floating_point_depth;
index c19f931..db45cbb 100644 (file)
@@ -115,7 +115,7 @@ struct lp_rast_plane {
    int32_t dcdy;
 
    /* one-pixel sized trivial reject offsets for each plane */
-   int64_t eo;
+   uint32_t eo;
 };
 
 /**
index c9b9221..0ae6ec2 100644 (file)
@@ -64,43 +64,43 @@ block_full_16(struct lp_rasterizer_task *task,
 }
 
 static inline unsigned
-build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
+build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
 {
    unsigned mask = 0;
 
-   int64_t c0 = c;
-   int64_t c1 = c0 + dcdy;
-   int64_t c2 = c1 + dcdy;
-   int64_t c3 = c2 + dcdy;
-
-   mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
-   mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
-   mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
-   mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
-   mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
-   mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
-   mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
-   mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
-   mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
-   mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
-   mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
-   mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
-   mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
-   mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
-   mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
-   mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
+   int32_t c0 = c;
+   int32_t c1 = c0 + dcdy;
+   int32_t c2 = c1 + dcdy;
+   int32_t c3 = c2 + dcdy;
+
+   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
+   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
+   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
+   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
+   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
+   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
+   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
+   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
+   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
+   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
+   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
+   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
+   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
+   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
+   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
+   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
   
    return mask;
 }
 
 
 static inline void
-build_masks(int64_t c,
-            int64_t cdiff,
-            int64_t dcdx,
-            int64_t dcdy,
-           unsigned *outmask,
-           unsigned *partmask)
+build_masks(int32_t c,
+            int32_t cdiff,
+            int32_t dcdx,
+            int32_t dcdy,
+            unsigned *outmask,
+            unsigned *partmask)
 {
    *outmask |= build_mask_linear(c, dcdx, dcdy);
    *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
@@ -133,47 +133,19 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_4(task, arg2);
 }
 
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
 
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<3)-1;
-   lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
-                         const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<4)-1;
-   lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
-{
-   lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
 
 static inline void
-build_masks_32(int c, 
-               int cdiff,
-               int dcdx,
-               int dcdy,
-               unsigned *outmask,
-               unsigned *partmask)
+build_masks_sse(int c,
+                int cdiff,
+                int dcdx,
+                int dcdy,
+                unsigned *outmask,
+                unsigned *partmask)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -214,7 +186,7 @@ build_masks_32(int c,
 
 
 static inline unsigned
-build_mask_linear_32(int c, int dcdx, int dcdy)
+build_mask_linear_sse(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 3
 
-
-
-
-
-
-
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                                0xffff & ~out[i].mask);
 }
 
-
-
-
-
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
@@ -471,11 +433,269 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 }
 
 #undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include <altivec.h>
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_ppc(int c,
+                int cdiff,
+                int dcdx,
+                int dcdy,
+                unsigned *outmask,
+                unsigned *partmask)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *outmask |= vec_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = (__m128i) vec_splats(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = vec_add_epi32(cstep0, cio4);
+      cstep1 = vec_add_epi32(cstep1, cio4);
+      cstep2 = vec_add_epi32(cstep2, cio4);
+      cstep3 = vec_add_epi32(cstep3, cio4);
+
+      cstep01 = vec_packs_epi32(cstep0, cstep1);
+      cstep23 = vec_packs_epi32(cstep2, cstep3);
+      result = vec_packs_epi16(cstep01, cstep23);
+
+      *partmask |= vec_movemask_epi8(result);
+   }
+}
+
+static inline unsigned
+build_mask_linear_ppc(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return vec_movemask_epi8(result);
+}
+
+static inline __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+                         (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#define NR_PLANES 3
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = vec_splats((unsigned char) 0);
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   __m128i vshuf_mask0;
+   __m128i vshuf_mask1;
+   __m128i vshuf_mask2;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
+#else
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
+#endif
+
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = vec_sub_epi32(zero, dcdx);
+
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
+   rej4 = vec_slli_epi32(rej4, 2);
+
+   /*
+    * Adjust so we can just check the sign bit (< 0 comparison),
+    * instead of having to do a less efficient <= 0 comparison
+    */
+   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
+   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
+
+   dcdx2 = vec_add_epi32(dcdx, dcdx);
+   dcdx3 = vec_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = vec_add_epi32(cx, rej4);
+         __m128i rej_masks = vec_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (vec_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
+            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
+            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);
+
+            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
+            __m128i c_01 = vec_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
+            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
+            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));
+
+            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
+            __m128i c_23 = vec_packs_epi32(c_2, c_3);
+            __m128i c_0123 = vec_packs_epi16(c_01, c_23);
+
+            unsigned mask = vec_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
+      }
+
+      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+#undef NR_PLANES
+
+#else
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   lp_rast_triangle_32_3_16(task, arg);
+}
+
 #endif
 
 
+#if defined PIPE_ARCH_SSE
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
+#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
+#else
 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
+#endif
+
+#define RASTER_64 1
 
 #define TAG(x) x##_1
 #define NR_PLANES 1
@@ -512,12 +732,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
-#ifdef PIPE_ARCH_SSE
-#undef BUILD_MASKS
-#undef BUILD_MASK_LINEAR
-#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
-#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
-#endif
+#undef RASTER_64
 
 #define TAG(x) x##_32_1
 #define NR_PLANES 1
index 52f6e99..05d2242 100644 (file)
@@ -50,9 +50,15 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
    int j;
 
    for (j = 0; j < NR_PLANES; j++) {
-      mask &= ~BUILD_MASK_LINEAR(c[j] - 1, 
-                                -plane[j].dcdx,
-                                plane[j].dcdy);
+#ifdef RASTER_64
+      mask &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER),
+                                 -plane[j].dcdx >> FIXED_ORDER,
+                                 plane[j].dcdy >> FIXED_ORDER);
+#else
+      mask &= ~BUILD_MASK_LINEAR((c[j] - 1),
+                                 -plane[j].dcdx,
+                                 plane[j].dcdy);
+#endif
    }
 
    /* Now pass to the shader:
@@ -79,17 +85,33 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    partmask = 0;                /* outside one or more trivial accept planes */
 
    for (j = 0; j < NR_PLANES; j++) {
+#ifdef RASTER_64
+      int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
+      int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
+      const int32_t cox = plane[j].eo >> FIXED_ORDER;
+      const int32_t ei = (dcdy + dcdx - cox) << 2;
+      const int32_t cox_s = cox << 2;
+      const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
+      int32_t cdiff;
+      cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
+                            (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
+      dcdx <<= 2;
+      dcdy <<= 2;
+#else
       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
       const int64_t cox = IMUL64(plane[j].eo, 4);
-      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
+      int32_t co, cdiff;
+      co = c[j] + cox;
+      cdiff = cio - cox;
+#endif
 
-      BUILD_MASKS(c[j] + cox,
-                 cio - cox,
-                 dcdx, dcdy, 
-                 &outmask,   /* sign bits from c[i][0..15] + cox */
-                 &partmask); /* sign bits from c[i][0..15] + cio */
+      BUILD_MASKS(co, cdiff,
+                  dcdx, dcdy,
+                  &outmask,   /* sign bits from c[i][0..15] + cox */
+                  &partmask); /* sign bits from c[i][0..15] + cio */
    }
 
    if (outmask == 0xffff)
@@ -179,14 +201,65 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
 
       {
-         const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
-         const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
-         const int64_t cox = IMUL64(plane[j].eo, 16);
-         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
-         const int64_t cio = IMUL64(ei, 16) - 1;
-
-         BUILD_MASKS(c[j] + cox,
-                     cio - cox,
+#ifdef RASTER_64
+         /*
+          * Strip off lower FIXED_ORDER bits. Note that those bits from
+          * dcdx, dcdy, eo are always 0 (by definition).
+          * c values, however, are not. This means that for every
+          * addition of the form c + n*dcdx the lower FIXED_ORDER bits will
+          * NOT change. And those bits are not relevant to the sign bit (which
+          * is only what we need!) that is,
+          * sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER))
+          * This means we can get away with using 32bit math for the most part.
+          * Only tricky part is the -1 adjustment for cdiff.
+          */
+         int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
+         int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
+         const int32_t cox = plane[j].eo >> FIXED_ORDER;
+         const int32_t ei = (dcdy + dcdx - cox) << 4;
+         const int32_t cox_s = cox << 4;
+         const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
+         int32_t cdiff;
+         /*
+          * Plausibility check to ensure the 32bit math works.
+          * Note that within a tile, the max we can move the edge function
+          * is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE.
+          * TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size
+          * and 8 subpixel bits), I'd be happy with 2 bits more too (1 for
+          * increasing fb size to 16384, the required d3d11 value, another one
+          * because I'm not quite sure we can't be _just_ above the max value
+          * here). This gives us 30 bits max - hence if c would exceed that here
+          * that means the plane is either trivial reject for the whole tile
+          * (in which case the tri will not get binned), or trivial accept for
+          * the whole tile (in which case plane_mask will not include it).
+          */
+         assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 &&
+                (c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff);
+         /*
+          * Note the fixup part is constant throughout the tile - thus could
+          * just calculate this and avoid _all_ 64bit math in rasterization
+          * (except exactly this fixup calc).
+          * In fact theoretically could move that even to setup, albeit that
+          * seems tricky (pre-bin certainly can have values larger than 32bit,
+          * and would need to communicate that fixup value through).
+          * And if we want to support msaa, we'd probably don't want to do the
+          * downscaling in setup in any case...
+          */
+         cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
+                               (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
+         dcdx <<= 4;
+         dcdy <<= 4;
+#else
+         const int32_t dcdx = -plane[j].dcdx << 4;
+         const int32_t dcdy = plane[j].dcdy << 4;
+         const int32_t cox = plane[j].eo << 4;
+         const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo;
+         const int32_t cio = (ei << 4) - 1;
+         int32_t co, cdiff;
+         co = c[j] + cox;
+         cdiff = cio - cox;
+#endif
+         BUILD_MASKS(co, cdiff,
                      dcdx, dcdy,
                      &outmask,   /* sign bits from c[i][0..15] + cox */
                      &partmask); /* sign bits from c[i][0..15] + cio */
index 899f28d..fb52f5d 100644 (file)
@@ -301,6 +301,15 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
    }
    /* should only get here on unhandled cases */
index ddbb88e..bd85051 100644 (file)
@@ -486,6 +486,11 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
                                    depth,
                                    stencil);
 
+   /*
+    * XXX: should make a full mask here for things like D24X8,
+    * otherwise we'll do a read-modify-write clear later which
+    * should be unnecessary.
+    */
    zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
                                        zmask32,
                                        smask8);
index 4451284..80acd74 100644 (file)
@@ -105,10 +105,10 @@ struct lp_setup_context
    float pixel_offset;
    float line_width;
    float point_size;
-   uint8_t psize_slot;
-   uint8_t viewport_index_slot;
-   uint8_t layer_slot;
-   uint8_t face_slot;
+   int8_t psize_slot;
+   int8_t viewport_index_slot;
+   int8_t layer_slot;
+   int8_t face_slot;
 
    struct pipe_framebuffer_state fb;
    struct u_rect framebuffer;
index fac1cd6..f425825 100644 (file)
@@ -644,19 +644,25 @@ try_setup_line( struct lp_setup_context *setup,
    line->inputs.layer = layer;
    line->inputs.viewport_index = viewport_index;
 
+   /*
+    * XXX: this code is mostly identical to the one in lp_setup_tri, except it
+    * uses 4 planes instead of 3. Could share the code (including the sse
+    * assembly, in fact we'd get the 4th plane for free).
+    * The only difference apart from storing the 4th plane would be some
+    * different shuffle for calculating dcdx/dcdy.
+    */
    for (i = 0; i < 4; i++) {
 
-      /* half-edge constants, will be interated over the whole render
+      /* half-edge constants, will be iterated over the whole render
        * target.
        */
       plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
 
-      
-      /* correct for top-left vs. bottom-left fill convention.  
-       */         
+      /* correct for top-left vs. bottom-left fill convention.
+       */
       if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane[i].c++;            
+         plane[i].c++;
       }
       else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
@@ -707,24 +713,24 @@ try_setup_line( struct lp_setup_context *setup,
       const struct u_rect *scissor =
          &setup->scissors[viewport_index];
 
-      plane[4].dcdx = -1;
+      plane[4].dcdx = -1 << 8;
       plane[4].dcdy = 0;
-      plane[4].c = 1-scissor->x0;
-      plane[4].eo = 1;
+      plane[4].c = (1-scissor->x0) << 8;
+      plane[4].eo = 1 << 8;
 
-      plane[5].dcdx = 1;
+      plane[5].dcdx = 1 << 8;
       plane[5].dcdy = 0;
-      plane[5].c = scissor->x1+1;
+      plane[5].c = (scissor->x1+1) << 8;
       plane[5].eo = 0;
 
       plane[6].dcdx = 0;
-      plane[6].dcdy = 1;
-      plane[6].c = 1-scissor->y0;
-      plane[6].eo = 1;
+      plane[6].dcdy = 1 << 8;
+      plane[6].c = (1-scissor->y0) << 8;
+      plane[6].eo = 1 << 8;
 
       plane[7].dcdx = 0;
-      plane[7].dcdy = -1;
-      plane[7].c = scissor->y1+1;
+      plane[7].dcdy = -1 << 8;
+      plane[7].c = (scissor->y1+1) << 8;
       plane[7].eo = 0;
    }
 
index 14c389f..ddb6f0e 100644 (file)
@@ -492,24 +492,24 @@ try_setup_point( struct lp_setup_context *setup,
    {
       struct lp_rast_plane *plane = GET_PLANES(point);
 
-      plane[0].dcdx = -1;
+      plane[0].dcdx = -1 << 8;
       plane[0].dcdy = 0;
-      plane[0].c = 1-bbox.x0;
-      plane[0].eo = 1;
+      plane[0].c = (1-bbox.x0) << 8;
+      plane[0].eo = 1 << 8;
 
-      plane[1].dcdx = 1;
+      plane[1].dcdx = 1 << 8;
       plane[1].dcdy = 0;
-      plane[1].c = bbox.x1+1;
+      plane[1].c = (bbox.x1+1) << 8;
       plane[1].eo = 0;
 
       plane[2].dcdx = 0;
-      plane[2].dcdy = 1;
-      plane[2].c = 1-bbox.y0;
-      plane[2].eo = 1;
+      plane[2].dcdy = 1 << 8;
+      plane[2].c = (1-bbox.y0) << 8;
+      plane[2].eo = 1 << 8;
 
       plane[3].dcdx = 0;
-      plane[3].dcdy = -1;
-      plane[3].c = bbox.y1+1;
+      plane[3].dcdy = -1 << 8;
+      plane[3].c = (bbox.y1+1) << 8;
       plane[3].eo = 0;
    }
 
index b1671dd..aa24176 100644 (file)
@@ -46,6 +46,9 @@
 
 #if defined(PIPE_ARCH_SSE)
 #include <emmintrin.h>
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include <altivec.h>
+#include "util/u_pwr8.h"
 #endif
 
 static inline int
@@ -65,11 +68,11 @@ fixed_to_float(int a)
 struct fixed_position {
    int32_t x[4];
    int32_t y[4];
-   int64_t area;
    int32_t dx01;
    int32_t dy01;
    int32_t dx20;
    int32_t dy20;
+   int64_t area;
 };
 
 
@@ -387,25 +390,21 @@ do_triangle_ccw(struct lp_setup_context *setup,
    plane = GET_PLANES(tri);
 
 #if defined(PIPE_ARCH_SSE)
-   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
-       setup->fb.height <= MAX_FIXED_LENGTH32 &&
-       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
-       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+   if (1) {
       __m128i vertx, verty;
       __m128i shufx, shufy;
-      __m128i dcdx, dcdy, c;
-      __m128i unused;
+      __m128i dcdx, dcdy;
+      __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
+      __m128i c01, c23, unused;
       __m128i dcdx_neg_mask;
       __m128i dcdy_neg_mask;
       __m128i dcdx_zero_mask;
-      __m128i top_left_flag;
-      __m128i c_inc_mask, c_inc;
+      __m128i top_left_flag, c_dec;
       __m128i eo, p0, p1, p2;
       __m128i zero = _mm_setzero_si128();
-      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
 
-      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
-      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */
+      vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
+      verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
 
       shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
       shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
@@ -419,42 +418,161 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);
 
-      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
-                                _mm_and_si128(dcdx_zero_mask,
-                                              _mm_xor_si128(dcdy_neg_mask,
-                                                            top_left_flag)));
-
-      c_inc = _mm_srli_epi32(c_inc_mask, 31);
+      c_dec = _mm_or_si128(dcdx_neg_mask,
+                           _mm_and_si128(dcdx_zero_mask,
+                                         _mm_xor_si128(dcdy_neg_mask,
+                                                       top_left_flag)));
 
-      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
-                        mm_mullo_epi32(dcdy, verty));
-
-      c = _mm_add_epi32(c, c_inc);
+      /*
+       * 64 bit arithmetic.
+       * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
+       */
+      cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
+      cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
+      c02 = _mm_sub_epi64(cdx02, cdy02);
+      c13 = _mm_sub_epi64(cdx13, cdy13);
+      c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(2,2,0,0)));
+      c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
+                                                 _MM_SHUFFLE(3,3,1,1)));
+
+      /*
+       * Useful for very small fbs/tris (or fewer subpixel bits) only:
+       * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+       *                   mm_mullo_epi32(dcdy, verty));
+       *
+       * c = _mm_sub_epi32(c, c_dec);
+       */
 
       /* Scale up to match c:
        */
       dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
       dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
 
-      /* Calculate trivial reject values:
+      /*
+       * Calculate trivial reject values:
+       * Note eo cannot overflow even if dcdx/dcdy would already have
+       * 31 bits (which they shouldn't have). This is because eo
+       * is never negative (albeit if we rely on that need to be careful...)
        */
       eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                          _mm_and_si128(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
 
+      /*
+       * Pointless transpose which gets undone immediately in
+       * rasterization.
+       * It is actually difficult to do away with it - would essentially
+       * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
+       * for this then would need to depend on the number of planes.
+       * The transpose is quite special here due to c being 64bit...
+       * The store has to be unaligned (unless we'd make the plane size
+       * a multiple of 128), and of course storing eo separately...
+       */
+      c01 = _mm_unpacklo_epi64(c02, c13);
+      c23 = _mm_unpackhi_epi64(c02, c13);
+      transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
+                         &p0, &p1, &p2, &unused);
+      _mm_storeu_si128((__m128i *)&plane[0], p0);
+      plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[1], p1);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
+      plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+      _mm_storeu_si128((__m128i *)&plane[2], p2);
+      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
+      plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
+   } else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+   /*
+    * XXX this code is effectively disabled for all practical purposes,
+    * as the allowed fb size is tiny if FIXED_ORDER is 8.
+    */
+   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+       setup->fb.height <= MAX_FIXED_LENGTH32 &&
+       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+      unsigned int bottom_edge;
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i_union vshuf_mask;
+      __m128i zero = vec_splats((unsigned char) 0);
+      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      vshuf_mask.i[0] = 0x07060504;
+      vshuf_mask.i[1] = 0x0B0A0908;
+      vshuf_mask.i[2] = 0x03020100;
+      vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+      vshuf_mask.i[0] = 0x00010203;
+      vshuf_mask.i[1] = 0x0C0D0E0F;
+      vshuf_mask.i[2] = 0x04050607;
+      vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+      /* vertex x coords */
+      vertx = vec_load_si128((const uint32_t *) position->x);
+      /* vertex y coords */
+      verty = vec_load_si128((const uint32_t *) position->y);
+
+      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+      dcdx = vec_sub_epi32(verty, shufy);
+      dcdy = vec_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+      top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+      c_inc_mask = vec_or(dcdx_neg_mask,
+                                vec_and(dcdx_zero_mask,
+                                              vec_xor(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+                        vec_mullo_epi32(dcdy, verty));
+
+      c = vec_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
+       */
+      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+                         vec_and(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
       /* Pointless transpose which gets undone immediately in
        * rasterization:
        */
       transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                        &p0, &p1, &p2, &unused);
 
-#define STORE_PLANE(plane, vec) do {                 \
-         _mm_store_si128((__m128i *)&temp_vec, vec); \
-         plane.c    = (int64_t)temp_vec[0];          \
-         plane.dcdx = temp_vec[1];                   \
-         plane.dcdy = temp_vec[2];                   \
-         plane.eo   = temp_vec[3];                   \
+#define STORE_PLANE(plane, vec) do {                  \
+         vec_store_si128((uint32_t *)&temp_vec, vec); \
+         plane.c    = (int64_t)temp_vec[0];           \
+         plane.dcdx = temp_vec[1];                    \
+         plane.dcdy = temp_vec[2];                    \
+         plane.eo   = temp_vec[3];                    \
       } while(0)
 
       STORE_PLANE(plane[0], p0);
@@ -473,17 +591,17 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[2].dcdx = position->dy20;
   
       for (i = 0; i < 3; i++) {
-         /* half-edge constants, will be interated over the whole render
+         /* half-edge constants, will be iterated over the whole render
           * target.
           */
          plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
-               IMUL64(plane[i].dcdy, position->y[i]);
+                      IMUL64(plane[i].dcdy, position->y[i]);
 
          /* correct for top-left vs. bottom-left fill convention.
-          */         
+          */
          if (plane[i].dcdx < 0) {
             /* both fill conventions want this - adjust for left edges */
-            plane[i].c++;            
+            plane[i].c++;
          }
          else if (plane[i].dcdx == 0) {
             if (setup->bottom_edge_rule == 0){
@@ -517,19 +635,19 @@ do_triangle_ccw(struct lp_setup_context *setup,
    }
 
    if (0) {
-      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+      debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[0].c,
                    plane[0].dcdx,
                    plane[0].dcdy,
                    plane[0].eo);
-      
-      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[1].c,
                    plane[1].dcdx,
                    plane[1].dcdy,
                    plane[1].eo);
-      
-      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
+
+      debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
                    plane[2].c,
                    plane[2].dcdx,
                    plane[2].dcdy,
@@ -558,24 +676,24 @@ do_triangle_ccw(struct lp_setup_context *setup,
    if (nr_planes == 7) {
       const struct u_rect *scissor = &setup->scissors[viewport_index];
 
-      plane[3].dcdx = -1;
+      plane[3].dcdx = -1 << 8;
       plane[3].dcdy = 0;
-      plane[3].c = 1-scissor->x0;
-      plane[3].eo = 1;
+      plane[3].c = (1-scissor->x0) << 8;
+      plane[3].eo = 1 << 8;
 
-      plane[4].dcdx = 1;
+      plane[4].dcdx = 1 << 8;
       plane[4].dcdy = 0;
-      plane[4].c = scissor->x1+1;
+      plane[4].c = (scissor->x1+1) << 8;
       plane[4].eo = 0;
 
       plane[5].dcdx = 0;
-      plane[5].dcdy = 1;
-      plane[5].c = 1-scissor->y0;
-      plane[5].eo = 1;
+      plane[5].dcdy = 1 << 8;
+      plane[5].c = (1-scissor->y0) << 8;
+      plane[5].eo = 1 << 8;
 
       plane[6].dcdx = 0;
-      plane[6].dcdy = -1;
-      plane[6].c = scissor->y1+1;
+      plane[6].dcdy = -1 << 8;
+      plane[6].c = (scissor->y1+1) << 8;
       plane[6].eo = 0;
    }
 
@@ -590,7 +708,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 static inline uint32_t 
 floor_pot(uint32_t n)
 {
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
    if (n == 0)
       return 0;
 
@@ -738,9 +856,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
          ei[i] = (plane[i].dcdy - 
                   plane[i].dcdx - 
-                  plane[i].eo) << TILE_ORDER;
+                  (int64_t)plane[i].eo) << TILE_ORDER;
 
-         eo[i] = plane[i].eo << TILE_ORDER;
+         eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
          xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
          ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
       }
@@ -848,29 +966,71 @@ static void retry_triangle_ccw( struct lp_setup_context *setup,
 
 /**
  * Calculate fixed position data for a triangle
+ * It is unfortunate we need to do that here (as we need area
+ * calculated in fixed point), as there's quite some code duplication
+ * to what is done in the jit setup prog.
  */
 static inline void
-calc_fixed_position( struct lp_setup_context *setup,
-                     struct fixed_position* position,
-                     const float (*v0)[4],
-                     const float (*v1)[4],
-                     const float (*v2)[4])
+calc_fixed_position(struct lp_setup_context *setup,
+                    struct fixed_position* position,
+                    const float (*v0)[4],
+                    const float (*v1)[4],
+                    const float (*v2)[4])
 {
+   /*
+    * The rounding may not be quite the same with PIPE_ARCH_SSE
+    * (util_iround right now only does nearest/even on x87,
+    * otherwise nearest/away-from-zero).
+    * Both should be acceptable, I think.
+    */
+#if defined(PIPE_ARCH_SSE)
+   __m128d v0r, v1r, v2r;
+   __m128 vxy0xy2, vxy1xy0;
+   __m128i vxy0xy2i, vxy1xy0i;
+   __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
+   __m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
+   __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
+   v0r = _mm_load_sd((const double *)v0[0]);
+   v1r = _mm_load_sd((const double *)v1[0]);
+   v2r = _mm_load_sd((const double *)v2[0]);
+   vxy0xy2 = _mm_castpd_ps(_mm_unpacklo_pd(v0r, v2r));
+   vxy1xy0 = _mm_castpd_ps(_mm_unpacklo_pd(v1r, v0r));
+   vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
+   vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
+   vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
+   vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
+   vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
+   vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
+   dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
+   _mm_store_si128((__m128i *)&position->dx01, dxdy0120);
+   /*
+    * For the mul, would need some more shuffles, plus emulation
+    * for the signed mul (without sse41), so don't bother.
+    */
+   x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
+   x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
+   x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
+   y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
+   _mm_store_si128((__m128i *)&position->x[0], x0120);
+   _mm_store_si128((__m128i *)&position->y[0], y0120);
+
+#else
    position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
    position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
    position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
-   position->x[3] = 0;
+   position->x[3] = 0; // should be unused
 
    position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
    position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
    position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-   position->y[3] = 0;
+   position->y[3] = 0; // should be unused
 
    position->dx01 = position->x[0] - position->x[1];
    position->dy01 = position->y[0] - position->y[1];
 
    position->dx20 = position->x[2] - position->x[0];
    position->dy20 = position->y[2] - position->y[0];
+#endif
 
    position->area = IMUL64(position->dx01, position->dy20) -
          IMUL64(position->dx20, position->dy01);
@@ -932,12 +1092,12 @@ rotate_fixed_position_12( struct fixed_position* position )
 /**
  * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4] )
+static void triangle_cw(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -953,12 +1113,12 @@ static void triangle_cw( struct lp_setup_context *setup,
 }
 
 
-static void triangle_ccw( struct lp_setup_context *setup,
-                          const float (*v0)[4],
-                          const float (*v1)[4],
-                          const float (*v2)[4])
+static void triangle_ccw(struct lp_setup_context *setup,
+                         const float (*v0)[4],
+                         const float (*v1)[4],
+                         const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -969,12 +1129,12 @@ static void triangle_ccw( struct lp_setup_context *setup,
 /**
  * Draw triangle whether it's CW or CCW.
  */
-static void triangle_both( struct lp_setup_context *setup,
-                          const float (*v0)[4],
-                          const float (*v1)[4],
-                          const float (*v2)[4] )
+static void triangle_both(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
 {
-   struct fixed_position position;
+   PIPE_ALIGN_VAR(16) struct fixed_position position;
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
 
    if (lp_context->active_statistics_queries &&
index f5bcfb2..34961cb 100644 (file)
 static void
 compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   const struct tgsi_shader_info *fsInfo = &llvmpipe->fs->info.base;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
    int vs_index;
    uint i;
 
    draw_prepare_shader_outputs(llvmpipe->draw);
 
-   llvmpipe->color_slot[0] = 0;
-   llvmpipe->color_slot[1] = 0;
-   llvmpipe->bcolor_slot[0] = 0;
-   llvmpipe->bcolor_slot[1] = 0;
-   llvmpipe->viewport_index_slot = 0;
-   llvmpipe->layer_slot = 0;
-   llvmpipe->face_slot = 0;
-   llvmpipe->psize_slot = 0;
+   /*
+    * Those can't actually be 0 (because pos is always at 0).
+    * But use ints anyway to avoid confusion (in vs outputs, they
+    * can very well be at pos 0).
+    */
+   llvmpipe->color_slot[0] = -1;
+   llvmpipe->color_slot[1] = -1;
+   llvmpipe->bcolor_slot[0] = -1;
+   llvmpipe->bcolor_slot[1] = -1;
+   llvmpipe->viewport_index_slot = -1;
+   llvmpipe->layer_slot = -1;
+   llvmpipe->face_slot = -1;
+   llvmpipe->psize_slot = -1;
 
    /*
     * Match FS inputs against VS outputs, emitting the necessary
@@ -73,60 +78,49 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
    vinfo->num_attribs = 0;
 
    vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION,
-                                      0);
+                                      TGSI_SEMANTIC_POSITION, 0);
 
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
 
-   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
+   for (i = 0; i < fsInfo->num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
-
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.base.input_semantic_name[i],
-                                         lpfs->info.base.input_semantic_index[i]);
+                                         fsInfo->input_semantic_name[i],
+                                         fsInfo->input_semantic_index[i]);
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
-          lpfs->info.base.input_semantic_index[i] < 2) {
-         int idx = lpfs->info.base.input_semantic_index[i];
-         llvmpipe->color_slot[idx] = vinfo->num_attribs;
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+          fsInfo->input_semantic_index[i] < 2) {
+         int idx = fsInfo->input_semantic_index[i];
+         llvmpipe->color_slot[idx] = (int)vinfo->num_attribs;
       }
 
-      if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
-         llvmpipe->face_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
+         llvmpipe->face_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       /*
        * For vp index and layer, if the fs requires them but the vs doesn't
-       * provide them, store the slot - we'll later replace the data directly
-       * with zero (as required by ARB_fragment_layer_viewport). This is
-       * because draw itself just redirects them to whatever was at output 0.
-       * We'll also store the real vpindex/layer slot for setup use.
+       * provide them, draw (vbuf) will give us the required 0 (slot -1).
+       * (This means in this case we'll also use those slots in setup, which
+       * isn't necessary but they'll contain the correct (0) value.)
        */
-      } else if (lpfs->info.base.input_semantic_name[i] ==
+      } else if (fsInfo->input_semantic_name[i] ==
                  TGSI_SEMANTIC_VIEWPORT_INDEX) {
-         if (vs_index >= 0) {
-            llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_vpindex_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
-         if (vs_index >= 0) {
-            llvmpipe->layer_slot = vinfo->num_attribs;
-         }
-         else {
-            llvmpipe->fake_layer_slot = vinfo->num_attribs;
-         }
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+      } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       } else {
          /*
-          * Emit the requested fs attribute for all but position.
+          * Note that we'd actually want to skip position (as we won't use
+          * the attribute in the fs) but can't. The reason is that we don't
+          * actually have a input/output map for setup (even though it looks
+          * like we do...). Could adjust for this though even without a map
+          * (in llvmpipe_create_fs_state()).
           */
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -137,8 +131,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          TGSI_SEMANTIC_BCOLOR, i);
 
       if (vs_index >= 0) {
-         llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+         llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -148,29 +142,29 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                       TGSI_SEMANTIC_PSIZE, 0);
 
    if (vs_index >= 0) {
-      llvmpipe->psize_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      llvmpipe->psize_slot = (int)vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
    }
 
    /* Figure out if we need viewport index (if it wasn't already in fs input) */
-   if (llvmpipe->viewport_index_slot == 0) {
+   if (llvmpipe->viewport_index_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_VIEWPORT_INDEX,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->viewport_index_slot =(int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
    /* Figure out if we need layer (if it wasn't already in fs input) */
-   if (llvmpipe->layer_slot == 0) {
+   if (llvmpipe->layer_slot < 0) {
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          TGSI_SEMANTIC_LAYER,
                                          0);
       if (vs_index >= 0) {
-         llvmpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         llvmpipe->layer_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
    }
 
@@ -197,10 +191,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
    }
       
-   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
-                          LP_NEW_FS |
+   if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_VS))
-      compute_vertex_info( llvmpipe );
+      compute_vertex_info(llvmpipe);
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_FRAMEBUFFER |
index 079083e..83ff976 100644 (file)
@@ -2695,34 +2695,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
 
       switch (shader->info.base.input_interpolate[i]) {
       case TGSI_INTERPOLATE_CONSTANT:
-        shader->inputs[i].interp = LP_INTERP_CONSTANT;
-        break;
+         shader->inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
       case TGSI_INTERPOLATE_LINEAR:
-        shader->inputs[i].interp = LP_INTERP_LINEAR;
-        break;
+         shader->inputs[i].interp = LP_INTERP_LINEAR;
+         break;
       case TGSI_INTERPOLATE_PERSPECTIVE:
-        shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
-        break;
+         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
       case TGSI_INTERPOLATE_COLOR:
-        shader->inputs[i].interp = LP_INTERP_COLOR;
-        break;
+         shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
       default:
-        assert(0);
-        break;
+         assert(0);
+         break;
       }
 
       switch (shader->info.base.input_semantic_name[i]) {
       case TGSI_SEMANTIC_FACE:
-        shader->inputs[i].interp = LP_INTERP_FACING;
-        break;
+         shader->inputs[i].interp = LP_INTERP_FACING;
+         break;
       case TGSI_SEMANTIC_POSITION:
-        /* Position was already emitted above
-         */
-        shader->inputs[i].interp = LP_INTERP_POSITION;
-        shader->inputs[i].src_index = 0;
-        continue;
+         /* Position was already emitted above
+          */
+         shader->inputs[i].interp = LP_INTERP_POSITION;
+         shader->inputs[i].src_index = 0;
+         continue;
       }
 
+      /* XXX this is a completely pointless index map... */
       shader->inputs[i].src_index = i+1;
    }
 
index d7ba5c8..6a4fbbb 100644 (file)
@@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm,
    /* Potentially modify it according to twoside, etc:
     */
    if (key->twoside) {
-      if (vert_attr == key->color_slot && key->bcolor_slot > 0)
+      if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
          lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
-      else if (vert_attr == key->spec_slot && key->bspec_slot > 0)
+      else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
          lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
@@ -602,13 +602,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
           */
          break;
 
-      case LP_INTERP_ZERO:
-         /*
-          * The information we get from the output is bogus, replace it
-          * with zero.
-          */
-         emit_constant_coef4(gallivm, args, slot+1, args->bld.zero);
-         break;
       case LP_INTERP_FACING:
          emit_facing_coef(gallivm, args, slot+1);
          break;
@@ -879,13 +872,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->pad = 0;
    memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
    for (i = 0; i < key->num_inputs; i++) {
-      if (key->inputs[i].interp == LP_INTERP_CONSTANT) {
-         if (key->inputs[i].src_index == lp->fake_vpindex_slot ||
-             key->inputs[i].src_index == lp->fake_layer_slot) {
-            key->inputs[i].interp = LP_INTERP_ZERO;
-         }
-      }
-      else if (key->inputs[i].interp == LP_INTERP_COLOR) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
          if (lp->rasterizer->flatshade)
             key->inputs[i].interp = LP_INTERP_CONSTANT;
          else
index 6cee6fe..9ad2444 100644 (file)
@@ -17,10 +17,10 @@ struct lp_setup_variant_list_item
 struct lp_setup_variant_key {
    unsigned size:16;
    unsigned num_inputs:8;
-   unsigned color_slot:8;
-   unsigned bcolor_slot:8;
-   unsigned spec_slot:8;
-   unsigned bspec_slot:8;
+   int color_slot:8;
+   int bcolor_slot:8;
+   int spec_slot:8;
+   int bspec_slot:8;
    unsigned flatshade_first:1;
    unsigned pixel_center_half:1;
    unsigned twoside:1;
index 7b19174..9139b83 100644 (file)
@@ -184,7 +184,7 @@ add_blend_test(struct gallivm_state *gallivm,
 
    LLVMBuildStore(builder, res, res_ptr);
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
index a30f35c..02a6319 100644 (file)
@@ -140,7 +140,7 @@ add_conv_test(struct gallivm_state *gallivm,
       LLVMBuildStore(builder, dst[i], ptr);
    }
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    gallivm_verify_function(gallivm, func);
 
index d09a0ab..d1fdd75 100644 (file)
@@ -390,6 +390,9 @@ enum SVSemantic
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
    SV_THREAD_KILL,
+   SV_BASEVERTEX,
+   SV_BASEINSTANCE,
+   SV_DRAWID,
    SV_UNDEFINED,
    SV_LAST
 };
index dca799d..1bf7240 100644 (file)
@@ -407,7 +407,7 @@ BuildUtil::loadImm(Value *dst, float f)
 Value *
 BuildUtil::loadImm(Value *dst, double d)
 {
-   return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d));
+   return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(8), mkImm(d));
 }
 
 Value *
@@ -499,7 +499,7 @@ BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
 
       return v;
    } else {
-      return up->getScratch();
+      return up->getScratch(eltSize);
    }
 }
 
index 8f3bf77..d171f64 100644 (file)
@@ -295,7 +295,7 @@ BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
 inline LValue *
 BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
 {
-   LValue *dst = getScratch();
+   LValue *dst = getScratch(typeSizeof(ty));
    mkLoad(ty, dst, mem, ptr);
    return dst;
 }
index b49bf9d..4504240 100644 (file)
@@ -124,6 +124,7 @@ struct nv50_ir_prog_info
    union {
       struct {
          uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+         bool usesDrawParameters;
       } vp;
       struct {
          uint8_t inputPatchSize;
@@ -160,8 +161,9 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
+      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
       uint16_t ucpBase;          /* base address for UCPs */
-      uint8_t ucpCBSlot;         /* constant buffer index of UCP data */
+      uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
       uint8_t instanceId;        /* system value index of InstanceID */
       uint8_t vertexId;          /* system value index of VertexID */
index e9ddd36..ec74e7a 100644 (file)
@@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F()
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
    emitFMZ  (0x2c, 1);
+   emitField(0x29, 1, insn->subOp);
    emitRND  (0x27, rnd, 0x2a);
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
    emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
index 1d4f0d9..0b28047 100644 (file)
@@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
 
       // for 8/16 source types, the byte/word is in subOp. word 1 is
       // represented as 2.
-      code[1] |= i->subOp << 0x17;
+      if (!isFloatType(i->sType))
+         code[1] |= i->subOp << 0x17;
+      else
+         code[1] |= i->subOp << 0x18;
 
       if (sat)
          code[0] |= 0x20;
index b233860..7b313f3 100644 (file)
@@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const
          x |= 2;
       return x;
    }
+   case TGSI_OPCODE_PK2H:
+      return 0x3;
+   case TGSI_OPCODE_UP2H:
+      return 0x1;
    default:
       break;
    }
@@ -348,7 +352,7 @@ static nv50_ir::DataFile translateFile(uint file)
    case TGSI_FILE_PREDICATE:       return nv50_ir::FILE_PREDICATE;
    case TGSI_FILE_IMMEDIATE:       return nv50_ir::FILE_IMMEDIATE;
    case TGSI_FILE_SYSTEM_VALUE:    return nv50_ir::FILE_SYSTEM_VALUE;
-   case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
+   //case TGSI_FILE_RESOURCE:        return nv50_ir::FILE_MEMORY_GLOBAL;
    case TGSI_FILE_SAMPLER:
    case TGSI_FILE_NULL:
    default:
@@ -377,6 +381,9 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
+   case TGSI_SEMANTIC_BASEVERTEX: return nv50_ir::SV_BASEVERTEX;
+   case TGSI_SEMANTIC_BASEINSTANCE: return nv50_ir::SV_BASEINSTANCE;
+   case TGSI_SEMANTIC_DRAWID:     return nv50_ir::SV_DRAWID;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
@@ -449,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_UBFE:
    case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_I2D:
@@ -513,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const
    case TGSI_OPCODE_DSGE:
    case TGSI_OPCODE_DSLT:
    case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_PK2H:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_U2F:
    case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_UP2H:
       return nv50_ir::TYPE_F32;
    case TGSI_OPCODE_I2D:
    case TGSI_OPCODE_U2D:
@@ -861,7 +871,7 @@ bool Source::scanSource()
    clipVertexOutput = -1;
 
    textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
-   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+   //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
 
    info->immd.bufSize = 0;
 
@@ -1128,6 +1138,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       case TGSI_SEMANTIC_SAMPLEPOS:
          info->prop.fp.sampleInterp = 1;
          break;
+      case TGSI_SEMANTIC_BASEVERTEX:
+      case TGSI_SEMANTIC_BASEINSTANCE:
+      case TGSI_SEMANTIC_DRAWID:
+         info->prop.vp.usesDrawParameters = true;
+         break;
       default:
          break;
       }
@@ -1144,6 +1159,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          }
       }
       break;
+/*
    case TGSI_FILE_RESOURCE:
       for (i = first; i <= last; ++i) {
          resources[i].target = decl->Resource.Resource;
@@ -1151,6 +1167,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          resources[i].slot = i;
       }
       break;
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
          textureViews[i].target = decl->SamplerView.Resource;
@@ -1216,11 +1233,13 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (src.isIndirect(0))
             mainTempsInLMem = true;
       } else
+/*
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
       } else
+*/
       if (src.getFile() == TGSI_FILE_OUTPUT) {
          if (src.isIndirect(0)) {
             // We don't know which one is accessed, just mark everything for
@@ -1271,9 +1290,11 @@ Instruction::getTexture(const tgsi::Source *code, int s) const
    unsigned int r;
 
    switch (getSrc(s).getFile()) {
+/*
    case TGSI_FILE_RESOURCE:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->resources.at(r).target);
+*/
    case TGSI_FILE_SAMPLER_VIEW:
       r = getSrc(s).getIndex(0);
       return translateTexture(code->textureViews.at(r).target);
@@ -1639,8 +1660,6 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          // don't load masked inputs, won't be assigned a slot
          if (!ptr && !(info->in[idx].mask & (1 << swz)))
             return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
-         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
-            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
          return interpolate(src, c, shiftAddress(ptr));
       } else
       if (prog->getType() == Program::TYPE_GEOMETRY) {
@@ -1681,7 +1700,7 @@ Converter::acquireDst(int d, int c)
    const int idx = dst.getIndex(0);
    const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
-   if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+   if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
       return NULL;
 
    if (dst.isIndirect(0) ||
@@ -2799,6 +2818,21 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
       break;
+   case TGSI_OPCODE_PK2H:
+      val0 = getScratch();
+      val1 = getScratch();
+      mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0));
+      mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1));
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+         mkOp3(OP_INSBF, TYPE_U32, dst0[c], val1, mkImm(0x1010), val0);
+      break;
+   case TGSI_OPCODE_UP2H:
+      src0 = fetchSrc(0, 0);
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         geni = mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F16, src0);
+         geni->subOp = c & 1;
+      }
+      break;
    case TGSI_OPCODE_EMIT:
       /* export the saved viewport index */
       if (viewport != NULL) {
@@ -3252,7 +3286,7 @@ Converter::handleUserClipPlanes()
 
    for (c = 0; c < 4; ++c) {
       for (i = 0; i < info->io.genUserClip; ++i) {
-         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
+         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.auxCBSlot,
                                 TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
          Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
          if (c == 0)
index e67bf3e..6530078 100644 (file)
@@ -1576,6 +1576,17 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
+   case SV_BASEVERTEX:
+   case SV_BASEINSTANCE:
+   case SV_DRAWID:
+      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
+                      bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.auxCBSlot,
+                                   TYPE_U32,
+                                   prog->driver->io.drawInfoBase +
+                                   4 * (sv - SV_BASEVERTEX)),
+                      NULL);
+      break;
    default:
       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
index 022626c..f5c590e 100644 (file)
@@ -676,23 +676,22 @@ ConstantFolding::expr(Instruction *i,
    switch (i->op) {
    case OP_MAD:
    case OP_FMA: {
-      i->op = OP_ADD;
+      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 
-      /* Move the immediate to the second arg, otherwise the ADD operation
-       * won't be emittable
-       */
-      i->setSrc(1, i->getSrc(0));
+      // Move the immediate into position 1, where we know it might be
+      // emittable. However it might not be anyways, as there may be other
+      // restrictions, so move it into a separate LValue.
+      bld.setPosition(i, false);
+      i->op = OP_ADD;
+      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
       i->setSrc(0, i->getSrc(2));
       i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
-      ImmediateValue src0;
       if (i->src(0).getImmediate(src0))
-         expr(i, src0, *i->getSrc(1)->asImm());
-      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
-         bld.setPosition(i, false);
-         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
-      }
+         expr(i, src0, src1);
+      else
+         opnd(i, src1, 1);
       break;
    }
    case OP_PFETCH:
@@ -1889,6 +1888,9 @@ AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
          arg = shift->getSrc(0);
          offset = imm.reg.data.u32;
       }
+      // We just AND'd the high bits away, which means this is effectively an
+      // unsigned value.
+      cvt->sType = TYPE_U32;
    } else if (insn->op == OP_SHR &&
               insn->sType == cvt->sType &&
               insn->src(1).getImmediate(imm)) {
index 19637ce..014c652 100644 (file)
@@ -295,6 +295,9 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_SAMPLE_INDEX:   return 0;
    case SV_SAMPLE_POS:     return 0;
    case SV_SAMPLE_MASK:    return 0;
+   case SV_BASEVERTEX:     return 0;
+   case SV_BASEINSTANCE:   return 0;
+   case SV_DRAWID:         return 0;
    default:
       return 0xffffffff;
    }
index 670b0c8..cd44aa1 100644 (file)
@@ -112,7 +112,7 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
    info.bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info.bin.source = tokens;
 
-   info.io.ucpCBSlot = 15;
+   info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
 
    info.io.resInfoCBSlot = 15;
index a6065e4..4ca9e5c 100644 (file)
@@ -147,6 +147,12 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
    if (nv_dbg)
       nouveau_mesa_debug = atoi(nv_dbg);
 
+   /* These must be set before any failure is possible, as the cleanup
+    * paths assume they're responsible for deleting them.
+    */
+   screen->drm = nouveau_drm(&dev->object);
+   screen->device = dev;
+
    /*
     * this is initialized to 1 in nouveau_drm_screen_create after screen
     * is fully constructed and added to the global screen list.
@@ -175,7 +181,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
                             data, size, &screen->channel);
    if (ret)
       return ret;
-   screen->device = dev;
 
    ret = nouveau_client_new(screen->device, &screen->client);
    if (ret)
@@ -229,6 +234,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 void
 nouveau_screen_fini(struct nouveau_screen *screen)
 {
+   int fd = screen->drm->fd;
+
    nouveau_mm_destroy(screen->mm_GART);
    nouveau_mm_destroy(screen->mm_VRAM);
 
@@ -238,6 +245,8 @@ nouveau_screen_fini(struct nouveau_screen *screen)
    nouveau_object_del(&screen->channel);
 
    nouveau_device_del(&screen->device);
+   nouveau_drm_del(&screen->drm);
+   close(fd);
 }
 
 static void
index 328646f..28c4760 100644 (file)
@@ -17,6 +17,7 @@ struct nouveau_bo;
 
 struct nouveau_screen {
    struct pipe_screen base;
+   struct nouveau_drm *drm;
    struct nouveau_device *device;
    struct nouveau_object *channel;
    struct nouveau_client *client;
index 58df5ee..809e971 100644 (file)
@@ -114,6 +114,11 @@ struct nouveau_vp3_decoder {
    unsigned fence_seq, fw_sizes, last_frame_num, tmp_stride, ref_stride;
 
    unsigned bsp_idx, vp_idx, ppp_idx;
+
+   /* End of the bsp bo where new data should be appended between one begin/end
+    * frame.
+    */
+   char *bsp_ptr;
 };
 
 struct comm {
@@ -208,11 +213,15 @@ nouveau_vp3_load_firmware(struct nouveau_vp3_decoder *dec,
                           enum pipe_video_profile profile,
                           unsigned chipset);
 
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec);
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes);
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes);
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc);
 
 void
 nouveau_vp3_vp_caps(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
index 692772e..a3d07de 100644 (file)
@@ -230,20 +230,58 @@ nouveau_vp3_fill_picparm_h264_bsp(struct nouveau_vp3_decoder *dec,
    return caps | 3;
 }
 
+static inline struct strparm_bsp *strparm_bsp(struct nouveau_vp3_decoder *dec)
+{
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   return (struct strparm_bsp *)(bsp_bo->map + 0x100);
+}
+
+void
+nouveau_vp3_bsp_begin(struct nouveau_vp3_decoder *dec)
+{
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+
+   dec->bsp_ptr = (void *)str_bsp;
+   memset(str_bsp, 0, 0x80);
+   dec->bsp_ptr += 0x100;
+   /* Reserved for picparm_vp */
+   dec->bsp_ptr += 0x300;
+   /* Reserved for comm */
+#if !NOUVEAU_VP3_DEBUG_FENCE
+   memset(dec->bsp_ptr, 0, 0x200);
+#endif
+   dec->bsp_ptr += 0x200;
+}
+
+void
+nouveau_vp3_bsp_next(struct nouveau_vp3_decoder *dec, unsigned num_buffers,
+                     const void *const *data, const unsigned *num_bytes)
+{
+#ifndef NDEBUG
+   unsigned comm_seq = dec->fence_seq;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+#endif
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   int i;
+
+   for (i = 0; i < num_buffers; ++i) {
+      assert(bsp_bo->size >= str_bsp->w0[0] + num_bytes[i]);
+      memcpy(dec->bsp_ptr, data[i], num_bytes[i]);
+      dec->bsp_ptr += num_bytes[i];
+      str_bsp->w0[0] += num_bytes[i];
+   }
+}
+
 uint32_t
-nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
-                struct nouveau_vp3_video_buffer *target,
-                unsigned comm_seq, unsigned num_buffers,
-                const void *const *data, const unsigned *num_bytes)
+nouveau_vp3_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc)
 {
    enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   unsigned comm_seq = dec->fence_seq;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   char *bsp;
    uint32_t endmarker, caps;
-   struct strparm_bsp *str_bsp;
-   int i;
-
-   bsp = bsp_bo->map;
+   struct strparm_bsp *str_bsp = strparm_bsp(dec);
+   char *bsp = bsp_bo->map;
    /*
     * 0x000..0x100: picparm_bsp
     * 0x200..0x500: picparm_vp
@@ -277,34 +315,21 @@ nouveau_vp3_bsp(struct nouveau_vp3_decoder *dec,  union pipe_desc desc,
    caps |= 1 << 17; // enable watchdog
    caps |= 0 << 18; // do not report error to VP, so it can continue decoding what we have
    caps |= 0 << 19; // if enabled, use crypto crap?
-   bsp += 0x100;
 
-   str_bsp = (struct strparm_bsp *)bsp;
-   memset(str_bsp, 0, 0x80);
-   str_bsp->w0[0] = 16;
+   str_bsp = strparm_bsp(dec);
    str_bsp->w1[0] = 0x1;
-   bsp += 0x100;
-   /* Reserved for picparm_vp */
-   bsp += 0x300;
-   /* Reserved for comm */
-#if !NOUVEAU_VP3_DEBUG_FENCE
-   memset(bsp, 0, 0x200);
-#endif
-   bsp += 0x200;
-   for (i = 0; i < num_buffers; ++i) {
-      memcpy(bsp, data[i], num_bytes[i]);
-      bsp += num_bytes[i];
-      str_bsp->w0[0] += num_bytes[i];
-   }
 
    /* Append end sequence */
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
-   bsp += 4;
-   *(uint32_t *)bsp = endmarker;
-   bsp += 4;
-   *(uint32_t *)bsp = 0x00000000;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = endmarker;
+   dec->bsp_ptr += 4;
+   *(uint32_t *)dec->bsp_ptr = 0x00000000;
+   str_bsp->w0[0] += 16;
+
+   dec->bsp_ptr = NULL;
 
    return caps;
 }
index 1319c32..f13988e 100644 (file)
@@ -6,6 +6,7 @@
 
 #include "pipe/p_defines.h"
 
+#include <drm.h>
 #include <nouveau.h>
 
 #ifndef NV04_PFIFO_MAX_PACKET_LEN
@@ -79,13 +80,13 @@ nouveau_screen_transfer_flags(unsigned pipe)
    return flags;
 }
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *);
 
 #endif
index 098d6e4..7b0d074 100644 (file)
@@ -208,17 +208,16 @@ nv30_render_release_vertices(struct vbuf_render *render)
 
 static const struct {
    unsigned emit;
-   unsigned interp;
    unsigned vp30;
    unsigned vp40;
    unsigned ow40;
 } vroute [] = {
-   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 },
-   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, INTERP_LINEAR     , 3, 1, 0x00000001 },
-   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
-   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
-   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
-   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
+   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, 0, 0, 0x00000000 },
+   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, 3, 1, 0x00000001 },
+   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, 1, 3, 0x00000004 },
+   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, 5, 5, 0x00000010 },
+   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, 6, 6, 0x00000020 },
+   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, 8, 7, 0x00004000 },
 };
 
 static bool
@@ -247,7 +246,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    if (emit == EMIT_OMIT)
       return false;
 
-   draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
+   draw_emit_vertex_attr(vinfo, emit, attrib);
    format = draw_translate_vinfo_format(emit);
 
    r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
index 154c3d3..933330f 100644 (file)
@@ -157,6 +157,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_VERTEX_BUFFERS:
    case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -174,6 +176,13 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -265,6 +274,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -308,6 +318,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -413,23 +424,20 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nv30_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      screen->base.base.context_create = NULL;        \
+      return &screen->base;                           \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *dev)
 {
-   struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+   struct nv30_screen *screen;
    struct pipe_screen *pscreen;
    struct nouveau_pushbuf *push;
    struct nv04_fifo *fifo;
    unsigned oclass = 0;
    int ret, i;
 
-   if (!screen)
-      return NULL;
-
    switch (dev->chipset & 0xf0) {
    case 0x30:
       if (RANKINE_0397_CHIPSET & (1 << (dev->chipset & 0x0f)))
@@ -458,10 +466,16 @@ nv30_screen_create(struct nouveau_device *dev)
 
    if (!oclass) {
       NOUVEAU_ERR("unknown 3d class for 0x%02x\n", dev->chipset);
-      FREE(screen);
       return NULL;
    }
 
+   screen = CALLOC_STRUCT(nv30_screen);
+   if (!screen)
+      return NULL;
+
+   pscreen = &screen->base.base;
+   pscreen->destroy = nv30_screen_destroy;
+
    /*
     * Some modern apps try to use msaa without keeping in mind the
     * restrictions on videomem of older cards. Resulting in dmesg saying:
@@ -479,8 +493,6 @@ nv30_screen_create(struct nouveau_device *dev)
    if (screen->max_sample_count > 4)
       screen->max_sample_count = 4;
 
-   pscreen = &screen->base.base;
-   pscreen->destroy = nv30_screen_destroy;
    pscreen->get_param = nv30_screen_get_param;
    pscreen->get_paramf = nv30_screen_get_paramf;
    pscreen->get_shader_param = nv30_screen_get_shader_param;
@@ -693,5 +705,5 @@ nv30_screen_create(struct nouveau_device *dev)
    nouveau_pushbuf_kick(push, push->channel);
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
-   return pscreen;
+   return &screen->base;
 }
index 2cebcd9..712d00e 100644 (file)
@@ -134,9 +134,11 @@ struct nv50_context {
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
    uint16_t constbuf_valid[3];
+   uint16_t constbuf_coherent[3];
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -148,6 +150,7 @@ struct nv50_context {
 
    struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS];
    unsigned num_textures[3];
+   uint32_t textures_coherent[3];
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
index 812d10c..7450119 100644 (file)
@@ -336,9 +336,10 @@ nv50_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
index a4b8ddf..888d62e 100644 (file)
@@ -148,7 +148,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
    for (m = 0, i = 0; i < info->numInputs; ++i) {
       switch (info->in[i].sn) {
       case TGSI_SEMANTIC_POSITION:
-      case TGSI_SEMANTIC_FACE:
          continue;
       default:
          m += info->in[i].flat ? 0 : 1;
@@ -166,9 +165,6 @@ nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
          for (c = 0; c < 4; ++c)
             if (info->in[i].mask & (1 << c))
                info->in[i].slot[c] = nintp++;
-      } else
-      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
-         info->in[i].slot[0] = 255;
       } else {
          unsigned j = info->in[i].flat ? m++ : n++;
 
@@ -335,7 +331,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;
 
-   info->io.ucpCBSlot = 15;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
index b6ebbbf..cccd3b7 100644 (file)
@@ -113,6 +113,12 @@ static void
 nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
 {
    struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nv50, hq);
+      return;
+   }
+
    nv50_hw_query_allocate(nv50, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
index d1bccb9..4a605f2 100644 (file)
@@ -71,7 +71,8 @@ nv50_hw_metric_destroy_query(struct nv50_context *nv50,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
    FREE(hmq);
 }
 
index 8453ce7..79c7023 100644 (file)
@@ -153,7 +153,9 @@ static void
 nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
 {
    struct nv50_query *q = &hq->base;
-   q->funcs->destroy_query(nv50, q);
+   nv50_hw_query_allocate(nv50, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
index 1e4b75f..712835c 100644 (file)
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,11 +213,19 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -300,6 +309,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
@@ -405,6 +415,11 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 
    if (screen->blitter)
       nv50_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nv50_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
+   }
 
    nouveau_bo_ref(NULL, &screen->code);
    nouveau_bo_ref(NULL, &screen->tls_bo);
@@ -518,11 +533,11 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    }
 
    BEGIN_NV04(push, NV50_3D(ZETA_COMP_ENABLE), 1);
-   PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+   PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-      PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+      PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -747,7 +762,7 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
    return 1;
 }
 
-struct pipe_screen *
+struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *dev)
 {
    struct nv50_screen *screen;
@@ -762,6 +777,7 @@ nv50_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nv50_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -782,7 +798,6 @@ nv50_screen_create(struct nouveau_device *dev)
 
    chan = screen->base.channel;
 
-   pscreen->destroy = nv50_screen_destroy;
    pscreen->context_create = nv50_create;
    pscreen->is_format_supported = nv50_screen_is_format_supported;
    pscreen->get_param = nv50_screen_get_param;
@@ -961,11 +976,11 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nv50_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
index de65597..cb04043 100644 (file)
@@ -664,6 +664,17 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
       if (old)
          nv50_screen_tic_unlock(nv50->screen, old);
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nv50->textures_coherent[s] |= 1 << i;
+         else
+            nv50->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nv50->textures_coherent[s] &= ~(1 << i);
+      }
+
       pipe_sampler_view_reference(&nv50->textures[s][i], views[i]);
    }
 
@@ -847,13 +858,19 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (res) {
       nv50->constbuf[s][i].offset = cb->buffer_offset;
       nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
+      if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nv50->constbuf_coherent[s] |= 1 << i;
+      else
+         nv50->constbuf_coherent[s] &= ~(1 << i);
    } else {
       nv50->constbuf_valid[s] &= ~(1 << i);
+      nv50->constbuf_coherent[s] &= ~(1 << i);
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
@@ -1003,6 +1020,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    if (!vb) {
       nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot);
       nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot);
+      nv50->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
       return;
    }
 
@@ -1015,9 +1033,16 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
             nv50->vbo_constant |= 1 << dst_index;
          else
             nv50->vbo_constant &= ~(1 << dst_index);
+         nv50->vtxbufs_coherent &= ~(1 << dst_index);
       } else {
          nv50->vbo_user &= ~(1 << dst_index);
          nv50->vbo_constant &= ~(1 << dst_index);
+
+         if (vb[i].buffer &&
+             vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+            nv50->vtxbufs_coherent |= (1 << dst_index);
+         else
+            nv50->vtxbufs_coherent &= ~(1 << dst_index);
       }
    }
 }
index 85878d5..60fa2bc 100644 (file)
@@ -91,6 +91,9 @@ nv50_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nv50_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         so->element[i].state |= i;
 
@@ -633,8 +636,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
          PUSH_DATA (push, prim);
 
-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
          nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
 
          switch (index_size) {
          case 4:
@@ -762,7 +765,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    bool tex_dirty = false;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nv50->vb_elt_first = info->min_index + info->index_bias;
@@ -791,27 +794,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
-      uint32_t valid = nv50->constbuf_valid[s];
-
-      while (valid && !nv50->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nv50->constbuf[s][i].user)
-            continue;
-
-         res = nv50->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = true;
-      }
+      if (nv50->constbuf_coherent[s])
+         nv50->cb_dirty = true;
    }
 
    /* If there are any coherent constbufs, flush the cache */
@@ -822,15 +807,10 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 3 && !tex_dirty; ++s) {
-      for (i = 0; i < nv50->num_textures[s] && !tex_dirty; ++i) {
-         if (!nv50->textures[s][i] ||
-             nv50->textures[s][i]->texture->target != PIPE_BUFFER)
-            continue;
-         if (nv50->textures[s][i]->texture->flags &
-             PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            tex_dirty = true;
-      }
+      if (nv50->textures_coherent[s])
+         tex_dirty = true;
    }
+
    if (tex_dirty) {
       BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
       PUSH_DATA (push, 0x20);
@@ -850,12 +830,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nv50->num_vtxbufs && !nv50->base.vbo_dirty; ++i) {
-      if (!nv50->vtxbuf[i].buffer)
-         continue;
-      if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = true;
-   }
+   nv50->base.vbo_dirty |= !!nv50->vtxbufs_coherent;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
index 20ea547..177a7e0 100644 (file)
@@ -25,6 +25,8 @@
 #include "util/u_sampler.h"
 #include "util/u_format.h"
 
+#include <nvif/class.h>
+
 static void
 nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
@@ -56,6 +58,28 @@ nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
    nv98_decoder_ppp(dec, desc, target, comm_seq);
 }
 
+static const struct nouveau_mclass
+nv98_decoder_msvld[] = {
+   { G98_MSVLD, -1 },
+   { IGT21A_MSVLD, -1 },
+   { GT212_MSVLD, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_mspdec[] = {
+   { G98_MSPDEC, -1 },
+   { GT212_MSPDEC, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_msppp[] = {
+   { G98_MSPPP, -1 },
+   { GT212_MSPPP, -1 },
+   {}
+};
+
 struct pipe_video_codec *
 nv98_create_decoder(struct pipe_context *context,
                     const struct pipe_video_codec *templ)
@@ -103,12 +127,33 @@ nv98_create_decoder(struct pipe_context *context,
    }
    push = dec->pushbuf;
 
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x85b1, NULL, 0, &dec->bsp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x85b2, NULL, 0, &dec->vp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x85b3, NULL, 0, &dec->ppp);
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[0], nv98_decoder_msvld);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[0], 0xbeef85b1,
+                                  nv98_decoder_msvld[ret].oclass, NULL, 0,
+                                  &dec->bsp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[1], nv98_decoder_mspdec);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[1], 0xbeef85b2,
+                                  nv98_decoder_mspdec[ret].oclass, NULL, 0,
+                                  &dec->vp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[2], nv98_decoder_msppp);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[2], 0xbeef85b3,
+                                  nv98_decoder_msppp[ret].oclass, NULL, 0,
+                                  &dec->ppp);
+      }
+   }
+
    if (ret)
       goto fail;
 
index dbde1bf..4fe0e05 100644 (file)
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
       struct nouveau_bo *tmp_bo = NULL;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
@@ -106,8 +106,9 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       return -1;
    }
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   nouveau_vp3_bsp_begin(dec);
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
index b2060d1..4daa57d 100644 (file)
@@ -230,27 +230,43 @@ locn_0f_ts:
  * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = index_bias
- * parm[4] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 5n + 0] = count
+ * parm[2 + 5n + 1] = instance_count
+ * parm[2 + 5n + 2] = start
+ * parm[2 + 5n + 3] = index_bias
+ * parm[2 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
  */
 .section #mme9097_draw_elts_indirect
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dei_draw_again:
    parm $r3 /* count */
    parm $r2 /* instance_count */
    parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
    parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
    braz $r2 #dei_end
-   parm $r5 /* start_instance */
-   read $r6 0x50d /* VB_ELEMENT_BASE */
-   read $r7 0x50e /* VB_INSTANCE_BASE */
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r4
    send $r5
    maddr 0x446
    send $r4
    mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dei_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
@@ -260,46 +276,218 @@ dei_again:
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r2 #dei_again
    mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dei_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dei_draw_again
+   mov $r6 (add $r6 1)
+   read $r6 0xd00
+   read $r7 0xd01
    maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
    send $r7
    exit maddr 0x446
    send $r6
-dei_end:
-   exit
-   nop
 
 /* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT:
  *
  * NOTE: Saves and restores VB_INSTANCE_BASE.
  *
  * arg     = mode
- * parm[0] = count
- * parm[1] = instance_count
- * parm[2] = start
- * parm[3] = start_instance
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2 + 4n + 0] = count
+ * parm[2 + 4n + 1] = instance_count
+ * parm[2 + 4n + 2] = start
+ * parm[2 + 4n + 3] = start_instance
  */
 .section #mme9097_draw_arrays_indirect
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+dai_draw_again:
    parm $r2 /* count */
    parm $r3 /* instance_count */
    parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
-   parm $r4 send $r4 /* start_instance */
    braz $r3 #dai_end
-   read $r6 0x50e /* VB_INSTANCE_BASE */
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
    maddr 0x50e /* VB_INSTANCE_BASE */
-   mov $r5 0x1
    send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
 dai_again:
    maddr 0x586 /* VERTEX_BEGIN_GL */
    send $r1 /* mode */
    maddr 0x35e /* VERTEX_BUFFER_COUNT */
    send $r2
-   mov $r3 (sub $r3 $r5)
+   mov $r3 (sub $r3 $r4)
    maddrsend 0x585 /* VERTEX_END_GL */
    branz $r3 #dai_again
-   mov $r1 (extrinsrt $r1 $r5 0 1 26) /* set INSTANCE_NEXT */
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+dai_end:
+   mov $r7 (add $r7 -1)
+   branz $r7 #dai_draw_again
+   mov $r6 (add $r6 1)
    exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+
+/* NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT
+ *
+ * NOTE: Saves and restores VB_ELEMENT,INSTANCE_BASE.
+ * Forcefully sets VERTEX_ID_BASE to the value of VB_ELEMENT_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 5n + 0] = count
+ * parm[3 + 5n + 1] = instance_count
+ * parm[3 + 5n + 2] = start
+ * parm[3 + 5n + 3] = index_bias
+ * parm[3 + 5n + 4] = start_instance
+ *
+ * SCRATCH[0] = saved VB_ELEMENT_BASE
+ * SCRATCH[1] = saved VB_INSTANCE_BASE
+ * SCRATCH[2] = draws left
+ */
+.section #mme9097_draw_elts_indirect_count
+   read $r6 0x50d /* VB_ELEMENT_BASE */
+   read $r7 0x50e /* VB_INSTANCE_BASE */
+   maddr 0x1d00
+   send $r6 /* SCRATCH[0] = VB_ELEMENT_BASE */
+   send $r7 /* SCRATCH[1] = VB_INSTANCE_BASE */
+   parm $r6 /* start_drawid */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #deic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz $r3 #deic_runout
+   send $r5
+deic_draw_again:
+   parm $r3 /* count */
+   parm $r2 /* instance_count */
+   parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */
+   parm $r4 send $r4 /* index_bias, send start */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   braz $r2 #deic_end
+   parm $r5 send $r4 /* start_instance, send index_bias */
+   send $r5 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
+   send $r4
+   send $r5
+   maddr 0x446
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+deic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x5f8 /* INDEX_BATCH_COUNT */
+   send $r3 /* count */
+   mov $r2 (sub $r2 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r2 #deic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+deic_end:
+   read $r5 0xd02
+   mov $r5 (add $r5 -1)
+   braz $r5 #deic_runout_check
+   mov $r7 (add $r7 -1)
+   maddr 0xd02
+   send $r5
+   branz $r7 #deic_draw_again
+   mov $r6 (add $r6 1)
+deic_restore:
+   read $r6 0xd00
+   read $r7 0xd01
+   maddr 0x150d /* VB_ELEMENT,INSTANCE_BASE */
    send $r6
-dai_end:
-   exit
-   nop
+   send $r7
+   exit maddr 0x446
+   send $r6
+deic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+deic_runout_check:
+   branz annul $r7 #deic_runout
+   bra annul #deic_restore
+
+/* NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT:
+ *
+ * NOTE: Saves and restores VB_INSTANCE_BASE.
+ *
+ * arg     = mode
+ * parm[0] = start_drawid
+ * parm[1] = numparams
+ * parm[2] = totaldraws
+ * parm[3 + 4n + 0] = count
+ * parm[3 + 4n + 1] = instance_count
+ * parm[3 + 4n + 2] = start
+ * parm[3 + 4n + 3] = start_instance
+ *
+ * SCRATCH[0] = VB_INSTANCE_BASE
+ */
+.section #mme9097_draw_arrays_indirect_count
+   read $r5 0x50e /* VB_INSTANCE_BASE */
+   maddr 0xd00
+   parm $r6 send $r5 /* start_drawid, save VB_INSTANCE_BASE */
+   parm $r7 /* numparams */
+   parm $r5 /* totaldraws */
+   mov $r5 (sub $r5 $r6) /* draws left */
+   braz $r5 #daic_runout
+   mov $r3 (extrinsrt 0x0 $r5 31 1 0) /* extract high bit */
+   branz annul $r3 #daic_runout
+daic_draw_again:
+   parm $r2 /* count */
+   parm $r3 /* instance_count */
+   parm $r4 maddr 0x35d /* VERTEX_BUFFER_FIRST, start */
+   braz $r3 #daic_end
+   parm $r4 send $r4 /* start_instance */
+   maddr 0x18e3 /* CB_POS */
+   send 0x180 /* 256 + 128 */
+   send 0x0 /* send 0 as base_vertex */
+   send $r4 /* send start_instance */
+   send $r6 /* draw id */
+   maddr 0x50e /* VB_INSTANCE_BASE */
+   send $r4
+   mov $r4 0x1
+   mov $r1 (extrinsrt $r1 0x0 0 1 26) /* clear INSTANCE_NEXT */
+daic_again:
+   maddr 0x586 /* VERTEX_BEGIN_GL */
+   send $r1 /* mode */
+   maddr 0x35e /* VERTEX_BUFFER_COUNT */
+   send $r2
+   mov $r3 (sub $r3 $r4)
+   maddrsend 0x585 /* VERTEX_END_GL */
+   branz $r3 #daic_again
+   mov $r1 (extrinsrt $r1 $r4 0 1 26) /* set INSTANCE_NEXT */
+daic_end:
+   mov $r5 (add $r5 -1)
+   braz $r5 #daic_runout_check
+   mov $r7 (add $r7 -1)
+   branz $r7 #daic_draw_again
+   mov $r6 (add $r6 1)
+daic_restore:
+   read $r5 0xd00
+   exit maddr 0x50e /* VB_INSTANCE_BASE to restore */
+   send $r5
+daic_runout:
+   parm $r2
+   parm $r2
+   parm $r2
+   parm $r2
+   mov $r7 (add $r7 -1)
+daic_runout_check:
+   branz annul $r7 #daic_runout
+   bra annul #daic_restore
index bac9042..bf8625e 100644 (file)
@@ -125,22 +125,33 @@ uint32_t mme9097_tep_select[] = {
 };
 
 uint32_t mme9097_draw_elts_indirect[] = {
+       0x01434615,
+/* 0x0007: dei_draw_again */
+       0x01438715,
+       0x07400021,
+       0x00003041,
+       0x00003841,
+       0x00000601,
+/* 0x0018: dei_again */
+       0x00000701,
        0x00000301,
+/* 0x0020: dei_end */
        0x00000201,
        0x017dc451,
-/* 0x000e: dei_again */
        0x00002431,
-       0x0005d007,
-       0x00000501,
-/* 0x001b: dei_end */
-       0x01434615,
-       0x01438715,
+       0x0638c021,
+       0x00600041,
+       0x0004d007,
+       0x00002531,
+       0x00002841,
+       0x00003041,
        0x05434021,
        0x00002041,
        0x00002841,
        0x01118021,
        0x00002041,
        0x00004411,
+       0xd0400912,
        0x01618021,
        0x00000841,
        0x017e0021,
@@ -149,37 +160,175 @@ uint32_t mme9097_draw_elts_indirect[] = {
        0x01614071,
        0xfffe9017,
        0xd0410912,
+       0xffffff11,
+       0xfff9b817,
+       0x00007611,
+       0x03400615,
+       0x03404715,
        0x05434021,
        0x00003041,
        0x00003841,
        0x011180a1,
        0x00003041,
-       0x00000091,
-       0x00000011,
 };
 
 uint32_t mme9097_draw_arrays_indirect[] = {
+/* 0x0003: dai_draw_again */
+       0x01438515,
+       0x00000601,
+       0x00000701,
        0x00000201,
+/* 0x0011: dai_again */
        0x00000301,
-/* 0x0009: dai_again */
        0x00d74451,
+/* 0x0019: dai_end */
+       0x0004d807,
        0x00002431,
-/* 0x0013: dai_end */
-       0x0003d807,
-       0x01438615,
+       0x0638c021,
+       0x00600041,
+       0x00000041,
+       0x00002041,
+       0x00003041,
        0x01438021,
-       0x00004511,
        0x00002041,
+       0x00004411,
+       0xd0400912,
        0x01618021,
        0x00000841,
        0x00d78021,
        0x00001041,
-       0x00055b10,
+       0x00051b10,
        0x01614071,
        0xfffe9817,
-       0xd0414912,
+       0xd0410912,
+       0xffffff11,
+       0xfffa7817,
+       0x00007611,
        0x014380a1,
+       0x00002841,
+};
+
+uint32_t mme9097_draw_elts_indirect_count[] = {
+       0x01434615,
+       0x01438715,
+       0x07400021,
+/* 0x000d: deic_draw_again */
        0x00003041,
-       0x00000091,
-       0x00000011,
+       0x00003841,
+       0x00000601,
+       0x00000701,
+/* 0x001e: deic_again */
+       0x00000501,
+       0x0005ad10,
+/* 0x0026: deic_end */
+       0x000b2807,
+       0x007f4312,
+/* 0x002e: deic_restore */
+       0x000a9817,
+       0x00002841,
+/* 0x0035: deic_runout */
+       0x00000301,
+/* 0x003b: deic_runout_check */
+       0x00000201,
+       0x017dc451,
+       0x00002431,
+       0x0638c021,
+       0x00600041,
+       0x0004d007,
+       0x00002531,
+       0x00002841,
+       0x00003041,
+       0x05434021,
+       0x00002041,
+       0x00002841,
+       0x01118021,
+       0x00002041,
+       0x00004411,
+       0xd0400912,
+       0x01618021,
+       0x00000841,
+       0x017e0021,
+       0x00001841,
+       0x00051210,
+       0x01614071,
+       0xfffe9017,
+       0xd0410912,
+       0x03408515,
+       0xffffed11,
+       0x0004e807,
+       0xffffff11,
+       0x03408021,
+       0x00002841,
+       0xfff87817,
+       0x00007611,
+       0x03400615,
+       0x03404715,
+       0x05434021,
+       0x00003041,
+       0x00003841,
+       0x011180a1,
+       0x00003041,
+       0x00000201,
+       0x00000201,
+       0x00000201,
+       0x00000201,
+       0x00000201,
+       0xffffff11,
+       0xfffeb837,
+       0xfffc8027,
+};
+
+uint32_t mme9097_draw_arrays_indirect_count[] = {
+       0x01438515,
+       0x03400021,
+/* 0x0009: daic_draw_again */
+       0x00002e31,
+       0x00000701,
+       0x00000501,
+/* 0x0017: daic_again */
+       0x0005ad10,
+       0x00086807,
+/* 0x001f: daic_end */
+       0x007f4312,
+       0x0007d837,
+/* 0x0024: daic_restore */
+/* 0x0027: daic_runout */
+       0x00000201,
+       0x00000301,
+/* 0x002c: daic_runout_check */
+       0x00d74451,
+       0x0004d807,
+       0x00002431,
+       0x0638c021,
+       0x00600041,
+       0x00000041,
+       0x00002041,
+       0x00003041,
+       0x01438021,
+       0x00002041,
+       0x00004411,
+       0xd0400912,
+       0x01618021,
+       0x00000841,
+       0x00d78021,
+       0x00001041,
+       0x00051b10,
+       0x01614071,
+       0xfffe9817,
+       0xd0410912,
+       0xffffed11,
+       0x00032807,
+       0xffffff11,
+       0xfff9f817,
+       0x00007611,
+       0x03400515,
+       0x014380a1,
+       0x00002841,
+       0x00000201,
+       0x00000201,
+       0x00000201,
+       0x00000201,
+       0xffffff11,
+       0xfffef837,
+       0xfffdc027,
 };
index 2e7c790..7180434 100644 (file)
@@ -195,8 +195,10 @@ nvc0_launch_grid(struct pipe_context *pipe,
    int ret;
 
    ret = !nvc0_compute_state_validate(nvc0);
-   if (ret)
-      goto out;
+   if (ret) {
+      NOUVEAU_ERR("Failed to launch grid !\n");
+      return;
+   }
 
    nvc0_compute_upload_input(nvc0, input);
 
@@ -246,15 +248,11 @@ nvc0_launch_grid(struct pipe_context *pipe,
    /* rebind all the 3D constant buffers
     * (looks like binding a CB on COMPUTE clobbers 3D state) */
    nvc0->dirty |= NVC0_NEW_CONSTBUF;
-   for (s = 0; s < 6; s++) {
+   for (s = 0; s < 5; s++) {
       for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; i++)
          if (nvc0->constbuf[s][i].u.buf)
             nvc0->constbuf_dirty[s] |= 1 << i;
    }
    memset(nvc0->state.uniform_buffer_bound, 0,
           sizeof(nvc0->state.uniform_buffer_bound));
-
-out:
-   if (ret)
-      NOUVEAU_ERR("Failed to launch grid !\n");
 }
index 39b73ec..1219548 100644 (file)
@@ -134,10 +134,12 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
+   uint16_t constbuf_coherent[6];
    bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
+   uint32_t vtxbufs_coherent;
    struct pipe_index_buffer idxbuf;
    uint32_t constant_vbos;
    uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
@@ -149,6 +151,7 @@ struct nvc0_context {
    struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
    unsigned num_textures[6];
    uint32_t textures_dirty[6];
+   uint32_t textures_coherent[6];
    struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[6];
    uint16_t samplers_dirty[6];
index bf2798a..27c026b 100644 (file)
@@ -29,4 +29,8 @@
 
 #define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT                   0x00003840
 
+#define NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT               0x00003848
+
+#define NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT             0x00003850
+
 #endif /* __NVC0_MACROS_H__ */
index 15991c3..ed1ac48 100644 (file)
@@ -248,9 +248,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
index 67a25ac..c3b5362 100644 (file)
@@ -55,7 +55,6 @@ nvc0_shader_input_address(unsigned sn, unsigned si)
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
-   case TGSI_SEMANTIC_FACE:         return 0x3fc;
    default:
       assert(!"invalid TGSI input semantic");
       return ~0;
@@ -285,8 +284,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    case PIPE_PRIM_TRIANGLES:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
       break;
    case PIPE_PRIM_QUADS:
       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -295,6 +292,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode = ~0;
       return;
    }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 
@@ -533,8 +534,9 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    info->bin.source = (void *)prog->pipe.tokens;
 
    info->io.genUserClip = prog->vp.num_ucps;
+   info->io.auxCBSlot = 15;
    info->io.ucpBase = 256;
-   info->io.ucpCBSlot = 15;
+   info->io.drawInfoBase = 256 + 128;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -583,6 +585,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    prog->num_barriers = info->numBarriers;
 
    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+   prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
 
    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
index 9c45e7b..8b8d221 100644 (file)
@@ -42,6 +42,7 @@ struct nvc0_program {
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
       bool need_vertex_id;
+      bool need_draw_parameters;
    } vp;
    struct {
       uint8_t early_z;
index 3845d61..7497317 100644 (file)
@@ -184,7 +184,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    count++;
 #endif
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += 2;
index 90ee82f..1bed016 100644 (file)
@@ -116,6 +116,12 @@ static void
 nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nvc0, hq);
+      return;
+   }
+
    nvc0_hw_query_allocate(nvc0, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
@@ -464,10 +470,7 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
 
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-
    PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
index 12fb609..7a64b69 100644 (file)
@@ -293,7 +293,8 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
    FREE(hmq);
 }
 
@@ -420,7 +421,10 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
 {
    switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) {
    case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
-      return sm20_hw_metric_calc_result(hq, res64);
+      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      if (res64[1])
+         return (res64[0] / (double)res64[1]) / 64;
+      break;
    case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
       return sm20_hw_metric_calc_result(hq, res64);
    case NVE4_HW_METRIC_QUERY_INST_ISSUED:
@@ -561,7 +565,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
    uint16_t class_3d = screen->base.class_3d;
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_METRIC_QUERY_COUNT;
index 7d1e75f..721857e 100644 (file)
@@ -782,7 +782,9 @@ static void
 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
 {
    struct nvc0_query *q = &hq->base;
-   q->funcs->destroy_query(nvc0, q);
+   nvc0_hw_query_allocate(nvc0, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
@@ -1075,17 +1077,6 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8],
    return true;
 }
 
-/* Metric calculations:
- * sum(x) ... sum of x over all MPs
- * avg(x) ... average of x over all MPs
- *
- * IPC              : sum(inst_executed) / clock
- * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
- * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
- * MP_EFFICIENCY    : avg(active_cycles / clock)
- *
- * NOTE: Interpretation of IPC requires knowledge of MP count.
- */
 static boolean
 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                             boolean wait, union pipe_query_result *result)
@@ -1130,7 +1121,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
    struct nvc0_hw_query *hq;
    unsigned space;
 
-   if (nvc0->screen->base.device->drm_version < 0x01000101)
+   if (nvc0->screen->base.drm->version < 0x01000101)
       return NULL;
 
    if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
@@ -1225,7 +1216,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
 {
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_SM_QUERY_COUNT;
index 461fcaa..ccf96fb 100644 (file)
@@ -22,6 +22,7 @@
 
 #include <xf86drm.h>
 #include <nouveau_drm.h>
+#include <nvif/class.h>
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "pipe/p_screen.h"
@@ -183,6 +184,11 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -205,6 +211,10 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -310,6 +320,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       return 16; /* would be 32 in linked (OpenGL-style) mode */
@@ -428,6 +439,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    if (screen->pm.prog) {
       screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
       nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
    }
 
    nouveau_bo_ref(NULL, &screen->text);
@@ -617,11 +629,10 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nvc0_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      goto fail;                                      \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *dev)
 {
    struct nvc0_screen *screen;
@@ -650,6 +661,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nvc0_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -672,7 +684,6 @@ nvc0_screen_create(struct nouveau_device *dev)
       screen->base.vidmem_bindings = 0;
    }
 
-   pscreen->destroy = nvc0_screen_destroy;
    pscreen->context_create = nvc0_create;
    pscreen->is_format_supported = nvc0_screen_is_format_supported;
    pscreen->get_param = nvc0_screen_get_param;
@@ -687,7 +698,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP;
-   if (dev->drm_version >= 0x01000202)
+   if (screen->base.drm->version >= 0x01000202)
       flags |= NOUVEAU_BO_COHERENT;
 
    ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo);
@@ -699,12 +710,13 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.fence.update = nvc0_screen_fence_update;
 
 
-   ret = nouveau_object_new(chan,
-                            (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, 0x906e,
-                            NULL, 0, &screen->nvsw);
+   ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e,
+                            NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw);
    if (ret)
       FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret);
 
+   BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->nvsw->handle);
 
    switch (dev->chipset & ~0xf) {
    case 0x110:
@@ -811,10 +823,11 @@ nvc0_screen_create(struct nouveau_device *dev)
       PUSH_DATA (push, 0x17);
    }
 
-   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), dev->drm_version >= 0x01000101);
+   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE),
+                    screen->base.drm->version >= 0x01000101);
    BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-           PUSH_DATA(push, dev->drm_version >= 0x01000101);
+           PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -910,7 +923,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
    PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
 
-   if (dev->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
       if (ret) {
          NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n");
@@ -1022,6 +1035,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
+   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
    PUSH_DATA (push, 1);
@@ -1061,11 +1076,11 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nvc0_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
index 5e84ca9..dc02b01 100644 (file)
@@ -317,6 +317,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
 
       if (!targ->clean)
          nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+      nouveau_pushbuf_space(push, 0, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
       PUSH_DATA (push, 1);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
index 41a824a..24a6c22 100644 (file)
@@ -554,6 +554,17 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
          continue;
       nvc0->textures_dirty[s] |= 1 << i;
 
+      if (views[i] && views[i]->texture) {
+         struct pipe_resource *res = views[i]->texture;
+         if (res->target == PIPE_BUFFER &&
+             (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+            nvc0->textures_coherent[s] |= 1 << i;
+         else
+            nvc0->textures_coherent[s] &= ~(1 << i);
+      } else {
+         nvc0->textures_coherent[s] &= ~(1 << i);
+      }
+
       if (old) {
          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
          nvc0_screen_tic_unlock(nvc0->screen, old);
@@ -596,6 +607,17 @@ nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
             continue;
          nvc0->textures_dirty[s] |= 1 << i;
 
+         if (views[p] && views[p]->texture) {
+            struct pipe_resource *res = views[p]->texture;
+            if (res->target == PIPE_BUFFER &&
+                (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+               nvc0->textures_coherent[s] |= 1 << i;
+            else
+               nvc0->textures_coherent[s] &= ~(1 << i);
+         } else {
+            nvc0->textures_coherent[s] &= ~(1 << i);
+         }
+
          if (nvc0->textures[s][i]) {
             struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
             nouveau_bufctx_reset(bctx, bin + i);
@@ -842,14 +864,20 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    } else
    if (cb) {
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
       nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
+      if (res && res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+         nvc0->constbuf_coherent[s] |= 1 << i;
+      else
+         nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
    else {
       nvc0->constbuf_valid[s] &= ~(1 << i);
+      nvc0->constbuf_coherent[s] &= ~(1 << i);
    }
 }
 
@@ -1009,6 +1037,7 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
     if (!vb) {
        nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot);
        nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot);
+       nvc0->vtxbufs_coherent &= ~(((1ull << count) - 1) << start_slot);
        return;
     }
 
@@ -1021,9 +1050,16 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
              nvc0->constant_vbos |= 1 << dst_index;
           else
              nvc0->constant_vbos &= ~(1 << dst_index);
+          nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        } else {
           nvc0->vbo_user &= ~(1 << dst_index);
           nvc0->constant_vbos &= ~(1 << dst_index);
+
+          if (vb[i].buffer &&
+              vb[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
+             nvc0->vtxbufs_coherent |= (1 << dst_index);
+          else
+             nvc0->vtxbufs_coherent &= ~(1 << dst_index);
        }
     }
 }
index f8e1efb..4e43c4e 100644 (file)
@@ -1030,9 +1030,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
       nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
                                        nvc0->cond_cond, nvc0->cond_mode);
 
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+   nouveau_scratch_done(&nvc0->base);
 
    nvc0->dirty = blit->saved.dirty |
       (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
index c464904..ad79d1c 100644 (file)
@@ -95,6 +95,9 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nvc0_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         size = util_format_get_blocksize(fmt);
 
@@ -784,7 +787,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
    }
 
    while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
       BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (push, mode);
       BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -804,19 +807,31 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nv04_resource *buf = nv04_resource(info->indirect);
-   unsigned size;
-   const uint32_t offset = buf->offset + info->indirect_offset;
+   struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
+   unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
+   uint32_t offset = buf->offset + info->indirect_offset;
 
    /* must make FIFO wait for engines idle before continuing to process */
-   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
+   if ((buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) ||
+       (buf_count && buf_count->fence_wr &&
+        !nouveau_fence_signalled(buf_count->fence_wr))) {
       IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);
+   }
+
+   /* Queue things up to let the macros write params to the driver constbuf */
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, 512);
+   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
 
-   PUSH_SPACE(push, 8);
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
       assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
-      size = 5 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ELEMENTS_INDIRECT), 1 + size / 4);
+      size = 5;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT;
    } else {
       if (nvc0->state.index_bias) {
          /* index_bias is implied 0 if !info->indexed (really ?) */
@@ -824,15 +839,59 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
          nvc0->state.index_bias = 0;
       }
-      size = 4 * 4;
-      BEGIN_1IC0(push, NVC0_3D(MACRO_DRAW_ARRAYS_INDIRECT), 1 + size / 4);
-   }
-   PUSH_DATA(push, nvc0_prim_gl(info->mode));
-#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
-   nouveau_pushbuf_data(push,
-                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
+      size = 4;
+      if (buf_count)
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT;
+      else
+         macro = NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT;
+   }
+
+   /* If the stride is not the natural stride, we have to stick a separate
+    * push data reference for each draw. Otherwise it can all go in as one.
+    * Of course there is a maximum packet size, so we have to break things up
+    * along those borders as well.
+    */
+   while (count) {
+      unsigned draws = count, pushes, i;
+      if (info->indirect_stride == size * 4) {
+         draws = MIN2(draws, (NV04_PFIFO_MAX_PACKET_LEN - 4) / size);
+         pushes = 1;
+      } else {
+         draws = MIN2(draws, 32);
+         pushes = draws;
+      }
+
+      nouveau_pushbuf_space(push, 16, 0, pushes + !!buf_count);
+      PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
+      if (buf_count)
+         PUSH_REFN(push, buf_count->bo, NOUVEAU_BO_RD | buf_count->domain);
+      PUSH_DATA(push,
+                NVC0_FIFO_PKHDR_1I(0, macro, 3 + !!buf_count + draws * size));
+      PUSH_DATA(push, nvc0_prim_gl(info->mode));
+      PUSH_DATA(push, drawid);
+      PUSH_DATA(push, draws);
+      if (buf_count) {
+         nouveau_pushbuf_data(push,
+                              buf_count->bo,
+                              buf_count->offset + info->indirect_params_offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | 4);
+      }
+      if (pushes == 1) {
+         nouveau_pushbuf_data(push,
+                              buf->bo, offset,
+                              NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4 * draws));
+         offset += draws * info->indirect_stride;
+      } else {
+         for (i = 0; i < pushes; i++) {
+            nouveau_pushbuf_data(push,
+                                 buf->bo, offset,
+                                 NVC0_IB_ENTRY_1_NO_PREFETCH | (size * 4));
+            offset += info->indirect_stride;
+         }
+      }
+      count -= draws;
+      drawid += draws;
+   }
 }
 
 static inline void
@@ -861,7 +920,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   int i, s;
+   int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
    nvc0->vb_elt_first = info->min_index + info->index_bias;
@@ -898,29 +957,25 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 
+   if (nvc0->vertprog->vp.need_draw_parameters) {
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (5 << 16) + (0 << 9));
+      if (!info->indirect) {
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, info->index_bias);
+         PUSH_DATA (push, info->start_instance);
+         PUSH_DATA (push, info->drawid);
+      }
+   }
+
    push->kick_notify = nvc0_draw_vbo_kick_notify;
 
-   /* TODO: Instead of iterating over all the buffer resources looking for
-    * coherent buffers, keep track of a context-wide count.
-    */
    for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
-      uint32_t valid = nvc0->constbuf_valid[s];
-
-      while (valid && !nvc0->cb_dirty) {
-         const unsigned i = ffs(valid) - 1;
-         struct pipe_resource *res;
-
-         valid &= ~(1 << i);
-         if (nvc0->constbuf[s][i].user)
-            continue;
-
-         res = nvc0->constbuf[s][i].u.buf;
-         if (!res)
-            continue;
-
-         if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = true;
-      }
+      if (nvc0->constbuf_coherent[s])
+         nvc0->cb_dirty = true;
    }
 
    if (nvc0->cb_dirty) {
@@ -929,14 +984,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    for (s = 0; s < 5; ++s) {
+      if (!nvc0->textures_coherent[s])
+         continue;
+
       for (int i = 0; i < nvc0->num_textures[s]; ++i) {
          struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
-         struct pipe_resource *res;
-         if (!tic)
-            continue;
-         res = nvc0->textures[s][i]->texture;
-         if (res->target != PIPE_BUFFER ||
-             !(res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT))
+         if (!(nvc0->textures_coherent[s] & (1 << i)))
             continue;
 
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -962,12 +1015,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_DATA (push, info->start_instance);
    }
 
-   for (i = 0; i < nvc0->num_vtxbufs && !nvc0->base.vbo_dirty; ++i) {
-      if (!nvc0->vtxbuf[i].buffer)
-         continue;
-      if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = true;
-   }
+   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
index 48ffac1..a9fd1d2 100644 (file)
 #include "util/u_format.h"
 
 static void
+nvc0_decoder_begin_frame(struct pipe_video_codec *decoder,
+                         struct pipe_video_buffer *target,
+                         struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = ++dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(dec);
+   assert(target);
+   assert(target->buffer_format == PIPE_FORMAT_NV12);
+
+   ret = nvc0_decoder_bsp_begin(dec, comm_seq);
+
+   assert(ret == 2);
+}
+
+static void
 nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
                               struct pipe_picture_desc *picture,
@@ -34,8 +52,24 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               const unsigned *num_bytes)
 {
    struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+   uint32_t comm_seq = dec->fence_seq;
+   unsigned ret = 0;
+
+   assert(decoder);
+
+   ret = nvc0_decoder_bsp_next(dec, comm_seq, num_buffers, data, num_bytes);
+
+   assert(ret == 2);
+}
+
+static void
+nvc0_decoder_end_frame(struct pipe_video_codec *decoder,
+                       struct pipe_video_buffer *video_target,
+                       struct pipe_picture_desc *picture)
+{
+   struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
    struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
-   uint32_t comm_seq = ++dec->fence_seq;
+   uint32_t comm_seq = dec->fence_seq;
    union pipe_desc desc;
 
    unsigned vp_caps, is_ref, ret;
@@ -43,11 +77,7 @@ nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
 
    desc.base = picture;
 
-   assert(target->base.buffer_format == PIPE_FORMAT_NV12);
-
-   ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes,
-                          &vp_caps, &is_ref, refs);
+   ret = nvc0_decoder_bsp_end(dec, desc, target, comm_seq, &vp_caps, &is_ref, refs);
 
    /* did we decode bitstream correctly? */
    assert(ret == 2);
@@ -164,14 +194,19 @@ nvc0_create_decoder(struct pipe_context *context,
    PUSH_DATA (push[2], dec->ppp->handle);
 
    dec->base.context = context;
+   dec->base.begin_frame = nvc0_decoder_begin_frame;
    dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+   dec->base.end_frame = nvc0_decoder_end_frame;
 
    for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
    if (!ret) {
       ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                            0x100, dec->inter_bo[0]->size, &cfg,
index 9ee0280..cf3c942 100644 (file)
 #include "util/u_video.h"
 
 extern unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16]);
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq);
+
+extern unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes);
+
+extern unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target,
+                     unsigned comm_seq, unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16]);
 
 extern void
 nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
index 4392f62..c53f946 100644 (file)
@@ -32,40 +32,34 @@ static void dump_comm_bsp(struct comm *comm)
 #endif
 
 unsigned
-nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
-                 struct nouveau_vp3_video_buffer *target,
-                 unsigned comm_seq, unsigned num_buffers,
-                 const void *const *data, const unsigned *num_bytes,
-                 unsigned *vp_caps, unsigned *is_ref,
-                 struct nouveau_vp3_video_buffer *refs[16])
+nvc0_decoder_bsp_begin(struct nouveau_vp3_decoder *dec, unsigned comm_seq)
 {
-   struct nouveau_pushbuf *push = dec->pushbuf[0];
-   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
-   uint32_t bsp_addr, comm_addr, inter_addr;
-   uint32_t slice_size, bucket_size, ring_size, bsp_size;
-   uint32_t caps, i;
-   int ret;
    struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
-   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
-   struct nouveau_pushbuf_refn bo_refs[] = {
-      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
-      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
-#if NOUVEAU_VP3_DEBUG_FENCE
-      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
-#endif
-      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
-   };
-   int num_refs = ARRAY_SIZE(bo_refs);
+   unsigned ret = 0;
 
-   if (!dec->bitplane_bo)
-      num_refs--;
+   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+   if (ret) {
+      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+      return -1;
+   }
 
-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
+   nouveau_vp3_bsp_begin(dec);
+
+   return 2;
+}
 
-   bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE;
+unsigned
+nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
+                      unsigned comm_seq, unsigned num_buffers,
+                      const void *const *data, const unsigned *num_bytes)
+{
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   uint32_t bsp_size = 0;
+   uint32_t i = 0;
+   unsigned ret = 0;
+
+   bsp_size = dec->bsp_ptr - (char *)bsp_bo->map;
    for (i = 0; i < num_buffers; i++)
       bsp_size += num_bytes[i];
    bsp_size += 256; /* the 4 end markers */
@@ -81,14 +75,29 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       bsp_size += (1 << 20) - 1;
       bsp_size &= ~((1 << 20) - 1);
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating bsp %u -> %u failed with %i\n",
                       bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
+      /* Preserve previous buffer. */
+      /* TODO: offload this copy to the GPU, as otherwise we're reading and
+       * writing to VRAM. */
+      memcpy(tmp_bo->map, bsp_bo->map, bsp_bo->size);
+
+      /* update position to current chunk */
+      dec->bsp_ptr = tmp_bo->map + (dec->bsp_ptr - (char *)bsp_bo->map);
+
       nouveau_bo_ref(NULL, &bsp_bo);
-      bo_refs[0].bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
+      dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH] = bsp_bo = tmp_bo;
    }
 
    if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
@@ -98,24 +107,61 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
       cfg.nvc0.tile_mode = 0x10;
       cfg.nvc0.memtype = 0xfe;
 
-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
       if (ret) {
          debug_printf("reallocating inter %u -> %u failed with %i\n",
                       inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
          return -1;
       }
+
+      ret = nouveau_bo_map(tmp_bo, NOUVEAU_BO_WR, dec->client);
+      if (ret) {
+         debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+         return -1;
+      }
+
       nouveau_bo_ref(NULL, &inter_bo);
-      bo_refs[1].bo = dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
+      dec->inter_bo[comm_seq & 1] = inter_bo = tmp_bo;
    }
 
-   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
-   if (ret) {
-      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
-      return -1;
-   }
+   nouveau_vp3_bsp_next(dec, num_buffers, data, num_bytes);
+
+   return 2;
+}
+
+
+unsigned
+nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+                     struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+                     unsigned *vp_caps, unsigned *is_ref,
+                     struct nouveau_vp3_video_buffer *refs[16])
+{
+   struct nouveau_pushbuf *push = dec->pushbuf[0];
+   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+   uint32_t bsp_addr, comm_addr, inter_addr;
+   uint32_t slice_size, bucket_size, ring_size;
+   uint32_t caps;
+   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+   unsigned fence_extra = 0;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+   int num_refs = ARRAY_SIZE(bo_refs);
+
+   if (!dec->bitplane_bo)
+      num_refs--;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+   fence_extra = 4;
+#endif
 
-   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
-                          num_buffers, data, num_bytes);
+   caps = nouveau_vp3_bsp_end(dec, desc);
 
    nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
 
index 4ea8ca3..79abe78 100644 (file)
@@ -68,6 +68,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_SW(m) 7, (m)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
 static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
index 88aad8a..4c415af 100644 (file)
@@ -384,7 +384,7 @@ void r500FragmentProgramDump(struct radeon_compiler *c, void *user)
     case R500_INST_TYPE_OUT: str = "OUT"; break;
     case R500_INST_TYPE_FC: str = "FC"; break;
     case R500_INST_TYPE_TEX: str = "TEX"; break;
-    };
+    }
     fprintf(stderr,"%s %s %s %s %s ", str,
            inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
            inst & R500_INST_LAST ? "LAST" : "",
index b393769..82ba043 100644 (file)
@@ -421,8 +421,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.create_video_codec = vl_create_decoder;
     r300->context.create_video_buffer = vl_video_buffer_create;
 
-    r300->uploader = u_upload_create(&r300->context, 256 * 1024, 4,
-                                     PIPE_BIND_CUSTOM);
+    r300->uploader = u_upload_create(&r300->context, 256 * 1024,
+                                     PIPE_BIND_CUSTOM, PIPE_USAGE_STREAM);
 
     r300->blitter = util_blitter_create(&r300->context);
     if (r300->blitter == NULL)
index b482fa1..7eda675 100644 (file)
@@ -1010,7 +1010,7 @@ static void r300_render_draw_elements(struct vbuf_render* render,
     CS_LOCALS(r300);
     DBG(r300, DBG_DRAW, "r300: render_draw_elements (count: %d)\n", count);
 
-    u_upload_data(r300->uploader, 0, count * 2, indices,
+    u_upload_data(r300->uploader, 0, count * 2, 4, indices,
                   &index_buffer_offset, &index_buffer);
     if (!index_buffer) {
         return;
index caeeec0..7221211 100644 (file)
@@ -37,7 +37,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     switch (*index_size) {
     case 1:
         *out_buffer = NULL;
-        u_upload_alloc(r300->uploader, 0, count * 2,
+        u_upload_alloc(r300->uploader, 0, count * 2, 4,
                        &out_offset, out_buffer, &ptr);
 
         util_shorten_ubyte_elts_to_userptr(
@@ -51,7 +51,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 2:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 2,
+            u_upload_alloc(r300->uploader, 0, count * 2, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_ushort_elts_to_userptr(&r300->context, ib,
@@ -65,7 +65,7 @@ void r300_translate_index_buffer(struct r300_context *r300,
     case 4:
         if (index_offset) {
             *out_buffer = NULL;
-            u_upload_alloc(r300->uploader, 0, count * 4,
+            u_upload_alloc(r300->uploader, 0, count * 4, 4,
                            &out_offset, out_buffer, &ptr);
 
             util_rebuild_uint_elts_to_userptr(&r300->context, ib,
index 606e25f..8823b8d 100644 (file)
@@ -183,6 +183,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SAMPLE_SHADING:
         case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -200,6 +202,13 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
         case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+        case PIPE_CAP_INVALIDATE_BUFFER:
+        case PIPE_CAP_GENERATE_MIPMAP:
             return 0;
 
         /* SWTCL-only features. */
@@ -304,6 +313,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
@@ -362,6 +372,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
             return 0;
         case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
             return 32;
index 737a6f5..42c8e3a 100644 (file)
@@ -42,7 +42,7 @@ void r300_upload_index_buffer(struct r300_context *r300,
     *index_buffer = NULL;
 
     u_upload_data(r300->uploader,
-                  0, count * index_size,
+                  0, count * index_size, 4,
                   ptr + (*start * index_size),
                   &index_offset,
                   index_buffer);
index da472f4..741e263 100644 (file)
@@ -52,7 +52,6 @@ enum r300_rs_col_write_type {
 
 static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum attrib_emit emit,
-                                  enum interp_mode interp,
                                   int index)
 {
     struct r300_vertex_shader* vs = r300->vs_state.state;
@@ -62,7 +61,7 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
     output = draw_find_shader_output(r300->draw,
                                      info->output_semantic_name[index],
                                      info->output_semantic_index[index]);
-    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output);
+    draw_emit_vertex_attr(&r300->vertex_info, emit, output);
 }
 
 static void r300_draw_emit_all_attribs(struct r300_context* r300)
@@ -73,31 +72,27 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
 
     /* Position. */
     if (vs_outputs->pos != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->pos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->pos);
     } else {
         assert(0);
     }
 
     /* Point size. */
     if (vs_outputs->psize != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS,
-                              vs_outputs->psize);
+        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, vs_outputs->psize);
     }
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->color[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->color[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->color[i]);
         }
     }
 
     /* Back-face colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
-                                  vs_outputs->bcolor[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->bcolor[i]);
         }
     }
 
@@ -108,16 +103,14 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) {
         if (vs_outputs->generic[i] != ATTR_UNUSED &&
             !(r300->sprite_coord_enable & (1 << i))) {
-            r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                                  vs_outputs->generic[i]);
+            r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]);
             gen_count++;
         }
     }
 
     /* Fog coordinates. */
     if (gen_count < 8 && vs_outputs->fog != ATTR_UNUSED) {
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->fog);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->fog);
         gen_count++;
     }
 
@@ -125,8 +118,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
     if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) {
         DBG(r300, DBG_SWTCL, "draw_emit_attrib: WPOS, index: %i\n",
             vs_outputs->wpos);
-        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
-                              vs_outputs->wpos);
+        r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->wpos);
     }
 }
 
index d83eb17..20945ec 100644 (file)
@@ -600,7 +600,7 @@ static void evergreen_launch_grid(
                            ctx->screen->has_compressed_msaa_texturing);
                 bc->type = TGSI_PROCESSOR_COMPUTE;
                 bc->isa = ctx->isa;
-                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
+                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
 
                 if (dump && !sb_disasm) {
                         r600_bytecode_disasm(bc);
index 02d0c7f..9dfb849 100644 (file)
@@ -1956,7 +1956,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 
                if (!gs_ring_buffer) {
                        radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-                                                   ALIGN_DIVUP(cb->buffer_size >> 4, 16), pkt_flags);
+                                                   DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
                        radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
                                                    pkt_flags);
                }
@@ -2414,7 +2414,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
        struct r600_command_buffer *cb = &rctx->start_cs_cmd;
        int tmp, i;
 
-       r600_init_command_buffer(cb, 342);
+       r600_init_command_buffer(cb, 338);
 
        /* This must be first. */
        r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -2468,10 +2468,6 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 
        r600_store_context_reg(cb, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0);
 
-       r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2);
-       r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */
-       r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */
-
        r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1);
 
        r600_store_context_reg_seq(cb, CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
@@ -2671,7 +2667,7 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
                return;
        }
 
-       r600_init_command_buffer(cb, 342);
+       r600_init_command_buffer(cb, 338);
 
        /* This must be first. */
        r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -2896,10 +2892,6 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
        r600_store_value(cb, 0); /* R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL */
        r600_store_value(cb, 0); /* R_028A40_VGT_GS_MODE */
 
-       r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2);
-       r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */
-       r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */
-
        r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1);
 
         r600_store_context_reg(cb, R_0288F0_SQ_VTX_SEMANTIC_CLEAR, ~0);
@@ -3731,7 +3723,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
        r600_init_atom(rctx, &rctx->blend_color.atom, id++, r600_emit_blend_color, 6);
        r600_init_atom(rctx, &rctx->blend_state.atom, id++, r600_emit_cso_state, 0);
        r600_init_atom(rctx, &rctx->cb_misc_state.atom, id++, evergreen_emit_cb_misc_state, 4);
-       r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 6);
+       r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 9);
        r600_init_atom(rctx, &rctx->clip_state.atom, id++, evergreen_emit_clip_state, 26);
        r600_init_atom(rctx, &rctx->db_misc_state.atom, id++, evergreen_emit_db_misc_state, 10);
        r600_init_atom(rctx, &rctx->db_state.atom, id++, evergreen_emit_db_state, 14);
index 1cc3031..8b91372 100644 (file)
@@ -726,7 +726,7 @@ static void tex_fetch_args(
                 * That operand should be passed as a float value in the args array
                 * right after the coord vector. After packing it's not used anymore,
                 * that's why arg_count is not increased */
-               coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+               coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
        }
 
        if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
@@ -915,14 +915,17 @@ unsigned r600_llvm_compile(
        enum radeon_family family,
        struct r600_bytecode *bc,
        boolean *use_kill,
-       unsigned dump)
+       unsigned dump,
+       struct pipe_debug_callback *debug)
 {
        unsigned r;
        struct radeon_shader_binary binary;
        const char * gpu_family = r600_get_llvm_processor_name(family);
 
        memset(&binary, 0, sizeof(struct radeon_shader_binary));
-       r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
+       if (dump)
+               LLVMDumpModule(mod);
+       r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
 
        r = r600_create_shader(bc, &binary, use_kill);
 
index 9b5304d..f570b73 100644 (file)
@@ -7,6 +7,7 @@
 #include "radeon/radeon_llvm.h"
 #include <llvm-c/Core.h>
 
+struct pipe_debug_callback;
 struct r600_bytecode;
 struct r600_shader_ctx;
 struct radeon_llvm_context;
@@ -22,7 +23,8 @@ unsigned r600_llvm_compile(
        enum radeon_family family,
        struct r600_bytecode *bc,
        boolean *use_kill,
-       unsigned dump);
+       unsigned dump,
+       struct pipe_debug_callback *debug);
 
 unsigned r600_create_shader(struct r600_bytecode *bc,
                const struct radeon_shader_binary *binary,
index 17006f7..08fdd36 100644 (file)
@@ -278,6 +278,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
        case PIPE_CAP_TGSI_TXQS:
        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+       case PIPE_CAP_INVALIDATE_BUFFER:
                return 1;
 
        case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -348,6 +349,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
        case PIPE_CAP_SHAREABLE_SHADERS:
        case PIPE_CAP_CLEAR_TEXTURE:
+       case PIPE_CAP_DRAW_PARAMETERS:
+       case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+       case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+       case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+       case PIPE_CAP_GENERATE_MIPMAP:
                return 0;
 
        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -522,6 +531,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
        case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
        case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
        case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
                return 0;
        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
                /* due to a bug in the shader compiler, some loops hang
index 795fb9a..0e4dd16 100644 (file)
@@ -152,6 +152,7 @@ struct r600_clip_misc_state {
        unsigned clip_plane_enable; /* from rasterizer    */
        unsigned clip_dist_write;   /* from vertex shader */
        boolean clip_disable;       /* from vertex shader */
+       boolean vs_out_viewport;    /* from vertex shader */
 };
 
 struct r600_alphatest_state {
@@ -945,7 +946,6 @@ static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
        return value * (1 << frac_bits);
 }
-#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
 static inline unsigned r600_pack_float_12p4(float x)
index d411b0b..df40f94 100644 (file)
@@ -162,7 +162,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct r600_pipe_shader_selector *sel = shader->selector;
        int r;
-       bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
+       bool dump = r600_can_dump_shader(&rctx->screen->b,
+                                        tgsi_get_processor_type(sel->tokens));
        unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
        unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
        unsigned export_shader;
@@ -394,7 +395,7 @@ static int tgsi_last_instruction(unsigned writemask)
 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 {
        struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
-       int j;
+       unsigned j;
 
        if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
                R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
@@ -1166,7 +1167,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
-       int i;
+       unsigned i;
        int num_baryc;
        struct tgsi_parse_context parse;
 
@@ -1585,7 +1586,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 {
        struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-       int i;
+       unsigned i;
 
        for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
                struct tgsi_full_src_register *src = &inst->Src[i];
@@ -1854,7 +1855,7 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
 {
        struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-       int i;
+       unsigned i;
 
        for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
                struct tgsi_full_src_register *src = &inst->Src[i];
@@ -2784,7 +2785,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
 
 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
 {
-       int i;
+       unsigned i;
        int stride, outer_comps, inner_comps;
        int tessinner_idx = -1, tessouter_idx = -1;
        int r;
@@ -3238,7 +3239,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
        if (use_llvm) {
                struct radeon_llvm_context radeon_llvm_ctx;
                LLVMModuleRef mod;
-               bool dump = r600_can_dump_shader(&rscreen->b, tokens);
+               bool dump = r600_can_dump_shader(&rscreen->b,
+                                                tgsi_get_processor_type(tokens));
                boolean use_kill = false;
 
                memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
@@ -3259,7 +3261,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
                ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
                ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
 
-               if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
+               if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
+                                     dump, &rctx->b.debug)) {
                        radeon_llvm_dispose(&radeon_llvm_ctx);
                        use_llvm = 0;
                        fprintf(stderr, "R600 LLVM backend failed to compile "
@@ -4424,7 +4427,7 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
                        memset(&alu, 0, sizeof(struct r600_bytecode_alu));
                        alu.op = ctx->inst_info->op;
                        for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-                               r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
+                               r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
                        }
                        alu.dst.sel = t1;
                        alu.dst.chan = i;
@@ -4791,7 +4794,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
        {
                int chan;
                int sel;
-               int i;
+               unsigned i;
 
                if (ctx->bc->chip_class == CAYMAN) {
                        for (i = 0; i < 3; i++) {
@@ -7925,7 +7928,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx)
        struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
        struct r600_bytecode_alu alu;
        int r;
-       int i;
+       unsigned i;
 
        /* result.x = 2^floor(src); */
        if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8054,7 +8057,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx)
        struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
        struct r600_bytecode_alu alu;
        int r;
-       int i;
+       unsigned i;
 
        /* result.x = floor(log2(|src|)); */
        if (inst->Dst[0].Register.WriteMask & 1) {
@@ -8781,7 +8784,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
 
 static int tgsi_endloop(struct r600_shader_ctx *ctx)
 {
-       int i;
+       unsigned i;
 
        r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
 
index e7ffe0d..f60e304 100644 (file)
@@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 
                if (!gs_ring_buffer) {
                        radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-                                              ALIGN_DIVUP(cb->buffer_size >> 4, 16));
+                                              DIV_ROUND_UP(cb->buffer_size, 256));
                        radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
                }
 
index 6a66634..c3346f2 100644 (file)
@@ -1106,10 +1106,10 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
                                tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]);
                        }
 
-                       u_upload_data(rctx->b.uploader, 0, size, tmpPtr, &cb->buffer_offset, &cb->buffer);
+                       u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer);
                        free(tmpPtr);
                } else {
-                       u_upload_data(rctx->b.uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer);
+                       u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer);
                }
                /* account it in gtt */
                rctx->b.gtt += input->buffer_size;
@@ -1377,11 +1377,13 @@ static void r600_update_clip_state(struct r600_context *rctx,
 {
        if (current->pa_cl_vs_out_cntl != rctx->clip_misc_state.pa_cl_vs_out_cntl ||
            current->shader.clip_dist_write != rctx->clip_misc_state.clip_dist_write ||
-           current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable) {
-                               rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl;
-                               rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write;
-                               rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space;
-                               r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
+           current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable ||
+           current->shader.vs_out_viewport != rctx->clip_misc_state.vs_out_viewport) {
+               rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl;
+               rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write;
+               rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space;
+               rctx->clip_misc_state.vs_out_viewport = current->shader.vs_out_viewport;
+               r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
        }
 }
 
@@ -1656,6 +1658,10 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom
        radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
                               state->pa_cl_vs_out_cntl |
                               (state->clip_plane_enable & state->clip_dist_write));
+       /* reuse needs to be set off if we write oViewport */
+       if (rctx->b.chip_class >= EVERGREEN)
+               radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
+                                      S_028AB4_REUSE_OFF(state->vs_out_viewport));
 }
 
 static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
@@ -1726,7 +1732,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                                }
                        }
 
-                       u_upload_alloc(rctx->b.uploader, start, count * 2,
+                       u_upload_alloc(rctx->b.uploader, start, count * 2, 256,
                                       &out_offset, &out_buffer, &ptr);
 
                        util_shorten_ubyte_elts_to_userptr(
@@ -1747,7 +1753,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
                                                 info.instance_count > 1 ||
                                                 info.count*ib.index_size > 20)) {
-                       u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
+                       u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, 256,
                                      ib.user_buffer, &ib.offset, &ib.buffer);
                        ib.user_buffer = NULL;
                }
index 13d8976..a6fc145 100644 (file)
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM
 
 AM_CFLAGS += \
-       $(LLVM_CFLAGS)
+       $(LLVM_CFLAGS) \
+       $(LIBELF_CFLAGS)
 
 libradeon_la_SOURCES += \
        $(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
        $(CLOCK_LIB) \
        $(LLVM_LIBS) \
-       $(ELF_LIB)
+       $(LIBELF_LIBS)
 
 libradeon_la_LDFLAGS = \
        $(LLVM_LDFLAGS)
index 1892527..6592c5b 100644 (file)
@@ -209,6 +209,36 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
        FREE(rbuffer);
 }
 
+static bool
+r600_do_invalidate_resource(struct r600_common_context *rctx,
+                           struct r600_resource *rbuffer)
+{
+       /* In AMD_pinned_memory, the user pointer association only gets
+        * broken when the buffer is explicitly re-allocated.
+        */
+       if (rctx->ws->buffer_is_user_ptr(rbuffer->buf))
+               return false;
+
+       /* Check if mapping this buffer would cause waiting for the GPU. */
+       if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+           !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+               rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+       } else {
+               util_range_set_empty(&rbuffer->valid_buffer_range);
+       }
+
+       return true;
+}
+
+void r600_invalidate_resource(struct pipe_context *ctx,
+                             struct pipe_resource *resource)
+{
+       struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+       struct r600_resource *rbuffer = r600_resource(resource);
+
+       (void)r600_do_invalidate_resource(rctx, rbuffer);
+}
+
 static void *r600_buffer_get_transfer(struct pipe_context *ctx,
                                      struct pipe_resource *resource,
                                       unsigned level,
@@ -276,13 +306,10 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
            !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                assert(usage & PIPE_TRANSFER_WRITE);
 
-               /* Check if mapping this buffer would cause waiting for the GPU. */
-               if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
-                   !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
-                       rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+               if (r600_do_invalidate_resource(rctx, rbuffer)) {
+                       /* At this point, the buffer is always idle. */
+                       usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
                }
-               /* At this point, the buffer is always idle. */
-               usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
        }
        else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
                 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
@@ -298,7 +325,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
                        struct r600_resource *staging = NULL;
 
                        u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
-                                      &offset, (struct pipe_resource**)&staging, (void**)&data);
+                                      256, &offset, (struct pipe_resource**)&staging, (void**)&data);
 
                        if (staging) {
                                data += box->x % R600_MAP_BUFFER_ALIGNMENT;
index a835aee..fad7bde 100644 (file)
@@ -202,9 +202,6 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
        for (i = 0; i < query->num_counters; ++i) {
                struct r600_pc_counter *counter = &query->counters[i];
 
-               if (counter->base == ~0)
-                       continue;
-
                for (j = 0; j < counter->dwords; ++j) {
                        uint32_t value = results[counter->base + j * counter->stride];
                        result->batch[i].u32 += value;
index 9a5e987..e926f56 100644 (file)
@@ -85,7 +85,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
        /* Upload vertices. The hw rectangle has only 3 vertices,
         * I guess the 4th one is derived from the first 3.
         * The vertex specification should match u_blitter's vertex element state. */
-       u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+       u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
        if (!buf)
                return;
 
@@ -227,6 +227,17 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
        return PIPE_UNKNOWN_CONTEXT_RESET;
 }
 
+static void r600_set_debug_callback(struct pipe_context *ctx,
+                                   const struct pipe_debug_callback *cb)
+{
+       struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+       if (cb)
+               rctx->debug = *cb;
+       else
+               memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
                              struct r600_common_screen *rscreen)
 {
@@ -246,12 +257,14 @@ bool r600_common_context_init(struct r600_common_context *rctx,
        else
                rctx->max_db = 4;
 
+       rctx->b.invalidate_resource = r600_invalidate_resource;
        rctx->b.transfer_map = u_transfer_map_vtbl;
        rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
        rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
        rctx->b.transfer_inline_write = u_default_transfer_inline_write;
         rctx->b.memory_barrier = r600_memory_barrier;
        rctx->b.flush = r600_flush_from_st;
+       rctx->b.set_debug_callback = r600_set_debug_callback;
 
        if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
                rctx->b.get_device_reset_status = r600_get_reset_status;
@@ -272,9 +285,9 @@ bool r600_common_context_init(struct r600_common_context *rctx,
        if (!rctx->allocator_so_filled_size)
                return false;
 
-       rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+       rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
                                        PIPE_BIND_INDEX_BUFFER |
-                                       PIPE_BIND_CONSTANT_BUFFER);
+                                       PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
        if (!rctx->uploader)
                return false;
 
@@ -999,13 +1012,9 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 }
 
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-                         const struct tgsi_token *tokens)
+                         unsigned processor)
 {
-       /* Compute shader don't have tgsi_tokens */
-       if (!tokens)
-               return (rscreen->debug_flags & DBG_CS) != 0;
-
-       switch (tgsi_get_processor_type(tokens)) {
+       switch (processor) {
        case TGSI_PROCESSOR_VERTEX:
                return (rscreen->debug_flags & DBG_VS) != 0;
        case TGSI_PROCESSOR_TESS_CTRL:
index c3933b1..27f6e98 100644 (file)
@@ -440,6 +440,8 @@ struct r600_common_context {
         * the GPU addresses are updated. */
        struct list_head                texture_buffers;
 
+       struct pipe_debug_callback      debug;
+
        /* Copy one resource to another using async DMA. */
        void (*dma_copy)(struct pipe_context *ctx,
                         struct pipe_resource *dst,
@@ -498,6 +500,9 @@ struct pipe_resource *
 r600_buffer_from_user_memory(struct pipe_screen *screen,
                             const struct pipe_resource *templ,
                             void *user_memory);
+void
+r600_invalidate_resource(struct pipe_context *ctx,
+                        struct pipe_resource *resource);
 
 /* r600_common_pipe.c */
 void r600_draw_rectangle(struct blitter_context *blitter,
@@ -514,7 +519,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 void r600_common_context_cleanup(struct r600_common_context *rctx);
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r);
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
-                         const struct tgsi_token *tokens);
+                         unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
                              unsigned offset, unsigned size, unsigned value,
                              bool is_framebuffer);
index ed0aefc..0aa19cd 100644 (file)
@@ -119,7 +119,7 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
                rctx->b.flush(&rctx->b, &query->fence, 0);
                break;
        case R600_QUERY_DRAW_CALLS:
-               query->begin_result = rctx->num_draw_calls;
+               query->end_result = rctx->num_draw_calls;
                break;
        case R600_QUERY_REQUESTED_VRAM:
        case R600_QUERY_REQUESTED_GTT:
@@ -141,10 +141,10 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
                query->begin_result = 0;
                break;
        case R600_QUERY_NUM_COMPILATIONS:
-               query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+               query->end_result = p_atomic_read(&rctx->screen->num_compilations);
                break;
        case R600_QUERY_NUM_SHADERS_CREATED:
-               query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+               query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
                break;
        default:
                unreachable("r600_query_sw_end: bad query type");
index 6b2ebde..3d09876 100644 (file)
  * Authors: Tom Stellard <thomas.stellard@amd.com>
  *
  */
+
 #include "radeon_llvm_emit.h"
 #include "radeon_elf_util.h"
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 
 #include <llvm-c/Target.h>
 #include <llvm-c/TargetMachine.h>
@@ -123,16 +126,44 @@ LLVMTargetRef radeon_llvm_get_r600_target(const char *triple)
        return target;
 }
 
+struct radeon_llvm_diagnostics {
+       struct pipe_debug_callback *debug;
+       unsigned retval;
+};
+
 static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
 {
-       if (LLVMGetDiagInfoSeverity(di) == LLVMDSError) {
-               unsigned int *diagnosticflag = (unsigned int *)context;
-               char *diaginfo_message = LLVMGetDiagInfoDescription(di);
+       struct radeon_llvm_diagnostics *diag = (struct radeon_llvm_diagnostics *)context;
+       LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+       char *description = LLVMGetDiagInfoDescription(di);
+       const char *severity_str = NULL;
+
+       switch (severity) {
+       case LLVMDSError:
+               severity_str = "error";
+               break;
+       case LLVMDSWarning:
+               severity_str = "warning";
+               break;
+       case LLVMDSRemark:
+               severity_str = "remark";
+               break;
+       case LLVMDSNote:
+               severity_str = "note";
+               break;
+       default:
+               severity_str = "unknown";
+       }
 
-               *diagnosticflag = 1;
-               fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", diaginfo_message);
-               LLVMDisposeMessage(diaginfo_message);
+       pipe_debug_message(diag->debug, SHADER_INFO,
+                          "LLVM diagnostic (%s): %s", severity_str, description);
+
+       if (severity == LLVMDSError) {
+               diag->retval = 1;
+               fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
        }
+
+       LLVMDisposeMessage(description);
 }
 
 /**
@@ -141,22 +172,25 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-                            const char *gpu_family, bool dump_ir, bool dump_asm,
-                            LLVMTargetMachineRef tm)
+                            const char *gpu_family,
+                            LLVMTargetMachineRef tm,
+                            struct pipe_debug_callback *debug)
 {
-
+       struct radeon_llvm_diagnostics diag;
        char cpu[CPU_STRING_LEN];
        char fs[FS_STRING_LEN];
        char *err;
        bool dispose_tm = false;
        LLVMContextRef llvm_ctx;
-       unsigned rval = 0;
        LLVMMemoryBufferRef out_buffer;
        unsigned buffer_size;
        const char *buffer_data;
        char triple[TRIPLE_STRING_LEN];
        LLVMBool mem_err;
 
+       diag.debug = debug;
+       diag.retval = 0;
+
        if (!tm) {
                strncpy(triple, "r600--", TRIPLE_STRING_LEN);
                LLVMTargetRef target = radeon_llvm_get_r600_target(triple);
@@ -165,20 +199,17 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
                }
                strncpy(cpu, gpu_family, CPU_STRING_LEN);
                memset(fs, 0, sizeof(fs));
-               if (dump_asm)
-                       strncpy(fs, "+DumpCode", FS_STRING_LEN);
+               strncpy(fs, "+DumpCode", FS_STRING_LEN);
                tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
                                  LLVMCodeGenLevelDefault, LLVMRelocDefault,
                                                  LLVMCodeModelDefault);
                dispose_tm = true;
        }
-       if (dump_ir)
-               LLVMDumpModule(M);
+
        /* Setup Diagnostic Handler*/
        llvm_ctx = LLVMGetModuleContext(M);
 
-       LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &rval);
-       rval = 0;
+       LLVMContextSetDiagnosticHandler(llvm_ctx, radeonDiagnosticHandler, &diag);
 
        /* Compile IR*/
        mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err,
@@ -187,13 +218,11 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
        /* Process Errors/Warnings */
        if (mem_err) {
                fprintf(stderr, "%s: %s", __FUNCTION__, err);
+               pipe_debug_message(debug, SHADER_INFO,
+                                  "LLVM emit error: %s", err);
                FREE(err);
-               LLVMDisposeTargetMachine(tm);
-               return 1;
-       }
-
-       if (0 != rval) {
-               fprintf(stderr, "%s: Processing Diag Flag\n", __FUNCTION__);
+               diag.retval = 1;
+               goto out;
        }
 
        /* Extract Shader Code*/
@@ -205,8 +234,11 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
        /* Clean up */
        LLVMDisposeMemoryBuffer(out_buffer);
 
+out:
        if (dispose_tm) {
                LLVMDisposeTargetMachine(tm);
        }
-       return rval;
+       if (diag.retval != 0)
+               pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
+       return diag.retval;
 }
index e20aed9..45f05a9 100644 (file)
@@ -31,6 +31,7 @@
 #include <llvm-c/TargetMachine.h>
 #include <stdbool.h>
 
+struct pipe_debug_callback;
 struct radeon_shader_binary;
 
 void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
@@ -38,7 +39,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-                            const char *gpu_family, bool dump_ir, bool dump_asm,
-                            LLVMTargetMachineRef tm);
+                            const char *gpu_family,
+                            LLVMTargetMachineRef tm,
+                            struct pipe_debug_callback *debug);
 
 #endif /* RADEON_LLVM_EMIT_H */
index 4af6a18..ad30474 100644 (file)
@@ -530,6 +530,14 @@ struct radeon_winsys {
                                          void *pointer, unsigned size);
 
     /**
+     * Whether the buffer was created from a user pointer.
+     *
+     * \param buf       A winsys buffer object
+     * \return          whether \p buf was created via buffer_from_ptr
+     */
+    bool (*buffer_is_user_ptr)(struct pb_buffer *buf);
+
+    /**
      * Get a winsys handle from a winsys buffer. The internal structure
      * of the handle is platform-specific and only a winsys should access it.
      *
index 2de237b..105a1b2 100644 (file)
@@ -196,7 +196,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
                        (tile_split << 11) | (mt << 8) | (array_mode << 3) |
                        lbpe;
                cs->buf[cs->cdw++] = y << 16; /* | x */
-               cs->buf[cs->cdw++] = 0; /* z */;
+               cs->buf[cs->cdw++] = 0; /* z */
                cs->buf[cs->cdw++] = addr & 0xfffffffc;
                cs->buf[cs->cdw++] = addr >> 32;
                cs->buf[cs->cdw++] = (pitch / bpe) - 1;
index 47a74ee..5a08cbf 100644 (file)
@@ -67,9 +67,9 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
                                program->shader.binary.global_symbol_offsets[i];
                unsigned scratch_bytes_needed;
 
-               si_shader_binary_read_config(sctx->screen,
-                                               &program->shader, offset);
-               scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+               si_shader_binary_read_config(&program->shader.binary,
+                                            &program->shader.config, offset);
+               scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
                scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
        }
 
@@ -87,7 +87,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
         * to the maximum bytes needed, so it can compute the stride
         * correctly.
         */
-       program->shader.scratch_bytes_per_wave = scratch_bytes;
+       program->shader.config.scratch_bytes_per_wave = scratch_bytes;
 
        /* Patch the shader with the scratch buffer address. */
        si_shader_apply_scratch_relocs(sctx,
@@ -122,8 +122,12 @@ static void *si_create_compute_state(
                for (i = 0; i < program->num_kernels; i++) {
                        LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
                                                         code, header->num_bytes);
-                       si_compile_llvm(sctx->screen, &program->kernels[i], sctx->tm,
-                                       mod);
+                       si_compile_llvm(sctx->screen, &program->kernels[i].binary,
+                                       &program->kernels[i].config, sctx->tm,
+                                       mod, &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+                       si_shader_dump(sctx->screen, &program->kernels[i],
+                                      &sctx->b.debug, TGSI_PROCESSOR_COMPUTE);
+                       si_shader_binary_upload(sctx->screen, &program->kernels[i]);
                        LLVMDisposeModule(mod);
                }
        }
@@ -136,7 +140,11 @@ static void *si_create_compute_state(
         * the shader code to the GPU.
         */
        init_scratch_buffer(sctx, program);
-       si_shader_binary_read(sctx->screen, &program->shader);
+       si_shader_binary_read_config(&program->shader.binary,
+                                    &program->shader.config, 0);
+       si_shader_dump(sctx->screen, &program->shader, &sctx->b.debug,
+                      TGSI_PROCESSOR_COMPUTE);
+       si_shader_binary_upload(sctx->screen, &program->shader);
 
 #endif
        program->input_buffer = si_resource_create_custom(sctx->b.b.screen,
@@ -259,7 +267,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
        /* Read the config information */
-       si_shader_binary_read_config(sctx->screen, shader, pc);
+       si_shader_binary_read_config(&shader->binary, &shader->config, pc);
 #endif
 
        /* Upload the kernel arguments */
@@ -280,12 +288,12 @@ static void si_launch_grid(
 
        memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
 
-       if (shader->scratch_bytes_per_wave > 0) {
+       if (shader->config.scratch_bytes_per_wave > 0) {
 
                COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
                            "Total Scratch: %u bytes\n", num_waves_for_scratch,
-                           shader->scratch_bytes_per_wave,
-                           shader->scratch_bytes_per_wave *
+                           shader->config.scratch_bytes_per_wave,
+                           shader->config.scratch_bytes_per_wave *
                            num_waves_for_scratch);
 
                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
@@ -312,7 +320,7 @@ static void si_launch_grid(
        si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
        si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
                S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
-               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));
+               |  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
 
        si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
        si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
@@ -360,9 +368,9 @@ static void si_launch_grid(
        si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
        si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
-       si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
+       si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1);
 
-       lds_blocks = shader->lds_size;
+       lds_blocks = shader->config.lds_size;
        /* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
         * blocks of 256 bytes, so if there are 4 bytes lds allocated in
         * the shader and 4 bytes allocated by the state tracker, then
@@ -376,10 +384,10 @@ static void si_launch_grid(
 
        assert(lds_blocks <= 0xFF);
 
-       shader->rsrc2 &= C_00B84C_LDS_SIZE;
-       shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+       shader->config.rsrc2 &= C_00B84C_LDS_SIZE;
+       shader->config.rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
 
-       si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
+       si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
        si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
 
        si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
@@ -401,7 +409,7 @@ static void si_launch_grid(
                 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
                 */
                S_00B860_WAVES(num_waves_for_scratch)
-               | S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10))
+               | S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
                ;
 
        si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
index c45f8c0..a07b1c5 100644 (file)
 #include "si_shader.h"
 #include "sid.h"
 #include "sid_tables.h"
+#include "radeon/radeon_elf_util.h"
 #include "ddebug/dd_util.h"
+#include "util/u_memory.h"
 
+DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL)
 
 static void si_dump_shader(struct si_shader_ctx_state *state, const char *name,
                           FILE *f)
@@ -42,6 +45,98 @@ static void si_dump_shader(struct si_shader_ctx_state *state, const char *name,
        fprintf(f, "%s\n\n", state->current->binary.disasm_string);
 }
 
+/**
+ * Shader compiles can be overridden with arbitrary ELF objects by setting
+ * the environment variable RADEON_REPLACE_SHADERS=num1:filename1[;num2:filename2]
+ */
+bool si_replace_shader(unsigned num, struct radeon_shader_binary *binary)
+{
+       const char *p = debug_get_option_replace_shaders();
+       const char *semicolon;
+       char *copy = NULL;
+       FILE *f;
+       long filesize, nread;
+       char *buf = NULL;
+       bool replaced = false;
+
+       if (!p)
+               return false;
+
+       while (*p) {
+               unsigned long i;
+               char *endp;
+               i = strtoul(p, &endp, 0);
+
+               p = endp;
+               if (*p != ':') {
+                       fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n");
+                       exit(1);
+               }
+               ++p;
+
+               if (i == num)
+                       break;
+
+               p = strchr(p, ';');
+               if (!p)
+                       return false;
+               ++p;
+       }
+       if (!*p)
+               return false;
+
+       semicolon = strchr(p, ';');
+       if (semicolon) {
+               p = copy = strndup(p, semicolon - p);
+               if (!copy) {
+                       fprintf(stderr, "out of memory\n");
+                       return false;
+               }
+       }
+
+       fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p);
+
+       f = fopen(p, "r");
+       if (!f) {
+               perror("radeonsi: failed to open file");
+               goto out_free;
+       }
+
+       if (fseek(f, 0, SEEK_END) != 0)
+               goto file_error;
+
+       filesize = ftell(f);
+       if (filesize < 0)
+               goto file_error;
+
+       if (fseek(f, 0, SEEK_SET) != 0)
+               goto file_error;
+
+       buf = MALLOC(filesize);
+       if (!buf) {
+               fprintf(stderr, "out of memory\n");
+               goto out_close;
+       }
+
+       nread = fread(buf, 1, filesize, f);
+       if (nread != filesize)
+               goto file_error;
+
+       radeon_elf_read(buf, filesize, binary);
+       replaced = true;
+
+out_close:
+       fclose(f);
+out_free:
+       FREE(buf);
+       free(copy);
+       return replaced;
+
+file_error:
+       perror("radeonsi: reading shader");
+       goto out_close;
+}
+
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
  * read them, or use "aha -b -f file" to convert them to html.
  */
index b3719de..d157a9f 100644 (file)
@@ -109,7 +109,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
        if (!desc->list_dirty)
                return true;
 
-       u_upload_alloc(sctx->b.uploader, 0, list_size,
+       u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
                       &desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, &ptr);
        if (!desc->buffer)
@@ -391,7 +391,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
         * directly through a staging buffer and don't go through
         * the fine-grained upload path.
         */
-       u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+       u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
        if (!desc->buffer)
                return false;
@@ -465,7 +465,7 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 {
        void *tmp;
 
-       u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
+       u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
                       (struct pipe_resource**)rbuffer, &tmp);
        if (rbuffer)
                util_memcpy_cpu_to_le32(tmp, ptr, size);
@@ -1011,19 +1011,19 @@ void si_init_all_descriptors(struct si_context *sctx)
 
        for (i = 0; i < SI_NUM_SHADERS; i++) {
                si_init_buffer_resources(&sctx->const_buffers[i],
-                                        SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
+                                        SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
                                         RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
                si_init_buffer_resources(&sctx->rw_buffers[i],
                                         SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
                                         RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
                si_init_descriptors(&sctx->samplers[i].views.desc,
-                                   SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+                                   SI_SGPR_SAMPLER_VIEWS, 8, SI_NUM_SAMPLER_VIEWS);
                si_init_descriptors(&sctx->samplers[i].states.desc,
-                                   SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
+                                   SI_SGPR_SAMPLER_STATES, 4, SI_NUM_SAMPLER_STATES);
        }
 
-       si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+       si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
                            4, SI_NUM_VERTEX_BUFFERS);
 
        /* Set pipe_context functions. */
index a0ddff6..7ee1dae 100644 (file)
@@ -436,7 +436,7 @@ static void si_pc_emit_select(struct r600_common_context *ctx,
 
                dw = count + regs->num_prelude;
                if (count >= regs->num_multi)
-                       count += regs->num_multi;
+                       dw += regs->num_multi;
                radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
                for (idx = 0; idx < regs->num_prelude; ++idx)
                        radeon_emit(cs, 0);
index ac13407..f6ff4a8 100644 (file)
@@ -301,6 +301,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_TGSI_TXQS:
        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+       case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+       case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+       case PIPE_CAP_INVALIDATE_BUFFER:
                return 1;
 
        case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -340,6 +343,12 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
        case PIPE_CAP_VERTEXID_NOBASE:
        case PIPE_CAP_CLEAR_TEXTURE:
+       case PIPE_CAP_DRAW_PARAMETERS:
+       case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT:
+       case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+       case PIPE_CAP_GENERATE_MIPMAP:
                return 0;
 
        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
@@ -512,6 +521,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
                return 1;
        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
                return 32;
+       case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+               return 0;
        }
        return 0;
 }
index 65c7e19..f83cb02 100644 (file)
@@ -329,6 +329,7 @@ void si_init_cp_dma_functions(struct si_context *sctx);
 /* si_debug.c */
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct si_context *sctx);
+bool si_replace_shader(unsigned num, struct radeon_shader_binary *binary);
 
 /* si_dma.c */
 void si_dma_copy(struct pipe_context *ctx,
index 4a67276..cc9718e 100644 (file)
@@ -82,11 +82,11 @@ struct si_shader_context
        int param_es2gs_offset;
        LLVMTargetMachineRef tm;
        LLVMValueRef const_md;
-       LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
+       LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
        LLVMValueRef lds;
        LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
-       LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
-       LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
+       LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS];
+       LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES];
        LLVMValueRef so_buffers[4];
        LLVMValueRef esgs_ring;
        LLVMValueRef gsvs_ring[4];
@@ -394,7 +394,7 @@ static void declare_input_vs(
        LLVMValueRef input;
 
        /* Load the T list */
-       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);
+       t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 
        t_offset = lp_build_const_int32(gallivm, input_index);
 
@@ -911,36 +911,6 @@ static void declare_input_fs(
 
        unsigned chan;
 
-       if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-               for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-                       unsigned soa_index =
-                               radeon_llvm_reg_index_soa(input_index, chan);
-                       radeon_bld->inputs[soa_index] =
-                               LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan);
-
-                       if (chan == 3)
-                               /* RCP for fragcoord.w */
-                               radeon_bld->inputs[soa_index] =
-                                       LLVMBuildFDiv(gallivm->builder,
-                                                     lp_build_const_float(gallivm, 1.0f),
-                                                     radeon_bld->inputs[soa_index],
-                                                     "");
-               }
-               return;
-       }
-
-       if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-                       LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-                       lp_build_const_float(gallivm, 0.0f);
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-                       lp_build_const_float(gallivm, 1.0f);
-
-               return;
-       }
-
        shader->ps_input_param_offset[input_index] = shader->nparam++;
        attr_number = lp_build_const_int32(gallivm,
                                           shader->ps_input_param_offset[input_index]);
@@ -975,10 +945,8 @@ static void declare_input_fs(
 
                face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
 
-               is_face_positive = LLVMBuildFCmp(gallivm->builder,
-                                                LLVMRealOGT, face,
-                                                lp_build_const_float(gallivm, 0.0f),
-                                                "");
+               is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+                                                face, uint->zero, "");
 
                args[2] = params;
                args[3] = interp_param;
@@ -1065,7 +1033,7 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld,
        struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
        struct gallivm_state *gallivm = &radeon_bld->gallivm;
        LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
        LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
        LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
 
@@ -1129,6 +1097,24 @@ static void declare_system_value(
                        assert(!"INVOCATIONID not implemented");
                break;
 
+       case TGSI_SEMANTIC_POSITION:
+       {
+               LLVMValueRef pos[4] = {
+                       LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
+                       LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
+                       LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
+                       lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
+                                                LLVMGetParam(radeon_bld->main_fn,
+                                                             SI_PARAM_POS_W_FLOAT)),
+               };
+               value = lp_build_gather_values(gallivm, pos, 4);
+               break;
+       }
+
+       case TGSI_SEMANTIC_FACE:
+               value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
+               break;
+
        case TGSI_SEMANTIC_SAMPLEID:
                value = get_sample_id(radeon_bld);
                break;
@@ -1233,13 +1219,13 @@ static LLVMValueRef fetch_constant(
        }
 
        if (reg->Register.Dimension && reg->Dimension.Indirect) {
-               LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+               LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
                LLVMValueRef index;
                index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
                                                   reg->Dimension.Index);
                bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
        } else
-               bufp = si_shader_ctx->const_resource[buf];
+               bufp = si_shader_ctx->const_buffers[buf];
 
        addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
        addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -1260,7 +1246,7 @@ static LLVMValueRef fetch_constant(
                addr2 = lp_build_add(&bld_base->uint_bld, addr2,
                                     lp_build_const_int32(base->gallivm, idx * 4));
 
-               result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+               result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf],
                                   addr2, bld_base->base.elem_type);
 
                result = radeon_llvm_emit_fetch_double(bld_base,
@@ -1302,18 +1288,8 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
        if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
                int cbuf = target - V_008DFC_SQ_EXP_MRT;
 
-               if (cbuf >= 0 && cbuf < 8) {
+               if (cbuf >= 0 && cbuf < 8)
                        compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
-
-                       if (compressed)
-                               si_shader_ctx->shader->spi_shader_col_format |=
-                                       V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
-                       else
-                               si_shader_ctx->shader->spi_shader_col_format |=
-                                       V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);
-
-                       si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
-               }
        }
 
        /* Set COMPR flag */
@@ -1333,34 +1309,19 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
                                                    LLVMInt32TypeInContext(base->gallivm->context),
                                                    pack_args, 2,
                                                    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
-                       args[chan + 7] = args[chan + 5] =
+                       args[chan + 5] =
                                LLVMBuildBitCast(base->gallivm->builder,
                                                 packed,
                                                 LLVMFloatTypeInContext(base->gallivm->context),
                                                 "");
+                       args[chan + 7] = base->undef;
                }
        } else
                memcpy(&args[5], values, sizeof(values[0]) * 4);
 }
 
-/* Load from output pointers and initialize arguments for the shader export intrinsic */
-static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
-                                         LLVMValueRef *out_ptr,
-                                         unsigned target,
-                                         LLVMValueRef *args)
-{
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
-       LLVMValueRef values[4];
-       int i;
-
-       for (i = 0; i < 4; i++)
-               values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");
-
-       si_llvm_init_export_args(bld_base, values, target, args);
-}
-
 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
-                         LLVMValueRef alpha_ptr)
+                         LLVMValueRef alpha)
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1372,8 +1333,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
                LLVMValueRef alpha_pass =
                        lp_build_cmp(&bld_base->base,
                                     si_shader_ctx->shader->key.ps.alpha_func,
-                                    LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
-                                    alpha_ref);
+                                    alpha, alpha_ref);
                LLVMValueRef arg =
                        lp_build_select(&bld_base->base,
                                        alpha_pass,
@@ -1390,16 +1350,14 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
                                LLVMVoidTypeInContext(gallivm->context),
                                NULL, 0, 0);
        }
-
-       si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
 }
 
-static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
-                                         LLVMValueRef alpha_ptr)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
+                                                 LLVMValueRef alpha)
 {
        struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
-       LLVMValueRef coverage, alpha;
+       LLVMValueRef coverage;
 
        /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
        coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -1417,9 +1375,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
                                 lp_build_const_float(gallivm,
                                        1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
-       alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
-       alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
-       LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
+       return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
 }
 
 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
@@ -1432,7 +1388,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
        unsigned chan;
        unsigned const_chan;
        LLVMValueRef base_elt;
-       LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
        LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
        LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
 
@@ -2112,197 +2068,193 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
        FREE(outputs);
 }
 
-static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
+                          LLVMValueRef depth, LLVMValueRef stencil,
+                          LLVMValueRef samplemask)
 {
-       struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
-       struct si_shader * shader = si_shader_ctx->shader;
-       struct lp_build_context * base = &bld_base->base;
-       struct lp_build_context * uint = &bld_base->uint_bld;
-       struct tgsi_shader_info *info = &shader->selector->info;
-       LLVMBuilderRef builder = base->gallivm->builder;
+       struct si_screen *sscreen = si_shader_context(bld_base)->screen;
+       struct lp_build_context *base = &bld_base->base;
+       struct lp_build_context *uint = &bld_base->uint_bld;
        LLVMValueRef args[9];
-       LLVMValueRef last_args[9] = { 0 };
-       int depth_index = -1, stencil_index = -1, samplemask_index = -1;
-       int i;
+       unsigned mask = 0;
 
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned semantic_name = info->output_semantic_name[i];
-               unsigned semantic_index = info->output_semantic_index[i];
-               unsigned target;
-               LLVMValueRef alpha_ptr;
+       assert(depth || stencil || samplemask);
 
-               /* Select the correct target */
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_POSITION:
-                       depth_index = i;
-                       continue;
-               case TGSI_SEMANTIC_STENCIL:
-                       stencil_index = i;
-                       continue;
-               case TGSI_SEMANTIC_SAMPLEMASK:
-                       samplemask_index = i;
-                       continue;
-               case TGSI_SEMANTIC_COLOR:
-                       target = V_008DFC_SQ_EXP_MRT + semantic_index;
-                       alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
+       args[1] = uint->one; /* whether the EXEC mask is valid */
+       args[2] = uint->one; /* DONE bit */
 
-                       if (si_shader_ctx->shader->key.ps.clamp_color) {
-                               for (int j = 0; j < 4; j++) {
-                                       LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j];
-                                       LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+       /* Specify the target we are exporting */
+       args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
 
-                                       result = radeon_llvm_saturate(bld_base, result);
-                                       LLVMBuildStore(builder, result, ptr);
-                               }
-                       }
+       args[4] = uint->zero; /* COMP flag */
+       args[5] = base->undef; /* R, depth */
+       args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
+       args[7] = base->undef; /* B, sample mask */
+       args[8] = base->undef; /* A, alpha to mask */
 
-                       if (si_shader_ctx->shader->key.ps.alpha_to_one)
-                               LLVMBuildStore(base->gallivm->builder,
-                                              base->one, alpha_ptr);
+       if (depth) {
+               args[5] = depth;
+               mask |= 0x1;
+       }
 
-                       if (semantic_index == 0 &&
-                           si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
-                               si_alpha_test(bld_base, alpha_ptr);
+       if (stencil) {
+               args[6] = stencil;
+               mask |= 0x2;
+       }
 
-                       if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
-                               si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
+       if (samplemask) {
+               args[7] = samplemask;
+               mask |= 0x4;
+       }
 
-                       break;
-               default:
-                       target = 0;
-                       fprintf(stderr,
-                               "Warning: SI unhandled fs output type:%d\n",
-                               semantic_name);
-               }
+       /* SI (except OLAND) has a bug that it only looks
+        * at the X writemask component. */
+       if (sscreen->b.chip_class == SI &&
+           sscreen->b.family != CHIP_OLAND)
+               mask |= 0x1;
 
-               si_llvm_init_export_args_load(bld_base,
-                                             si_shader_ctx->radeon_bld.soa.outputs[i],
-                                             target, args);
-
-               if (semantic_name == TGSI_SEMANTIC_COLOR) {
-                       /* If there is an export instruction waiting to be emitted, do so now. */
-                       if (last_args[0]) {
-                               lp_build_intrinsic(base->gallivm->builder,
-                                                  "llvm.SI.export",
-                                                  LLVMVoidTypeInContext(base->gallivm->context),
-                                                  last_args, 9, 0);
-                       }
+       /* Specify which components to enable */
+       args[0] = lp_build_const_int32(base->gallivm, mask);
 
-                       /* This instruction will be emitted at the end of the shader. */
-                       memcpy(last_args, args, sizeof(args));
-
-                       /* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
-                       if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-                           semantic_index == 0 &&
-                           si_shader_ctx->shader->key.ps.last_cbuf > 0) {
-                               for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
-                                       si_llvm_init_export_args_load(bld_base,
-                                                                     si_shader_ctx->radeon_bld.soa.outputs[i],
-                                                                     V_008DFC_SQ_EXP_MRT + c, args);
-                                       lp_build_intrinsic(base->gallivm->builder,
-                                                          "llvm.SI.export",
-                                                          LLVMVoidTypeInContext(base->gallivm->context),
-                                                          args, 9, 0);
-                               }
-                       }
-               } else {
-                       lp_build_intrinsic(base->gallivm->builder,
-                                          "llvm.SI.export",
+       lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+                          LLVMVoidTypeInContext(base->gallivm->context),
+                          args, 9, 0);
+}
+
+static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
+                               LLVMValueRef *color, unsigned index,
+                               bool is_last)
+{
+       struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+       struct lp_build_context *base = &bld_base->base;
+       LLVMValueRef args[9];
+       int i;
+
+       /* Clamp color */
+       if (si_shader_ctx->shader->key.ps.clamp_color)
+               for (i = 0; i < 4; i++)
+                       color[i] = radeon_llvm_saturate(bld_base, color[i]);
+
+       /* Alpha to one */
+       if (si_shader_ctx->shader->key.ps.alpha_to_one)
+               color[3] = base->one;
+
+       /* Alpha test */
+       if (index == 0 &&
+           si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
+               si_alpha_test(bld_base, color[3]);
+
+       /* Line & polygon smoothing */
+       if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
+               color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]);
+
+       /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+       if (index == 0 &&
+           si_shader_ctx->shader->key.ps.last_cbuf > 0) {
+               for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
+                       si_llvm_init_export_args(bld_base, color,
+                                                V_008DFC_SQ_EXP_MRT + c, args);
+                       lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
                                           LLVMVoidTypeInContext(base->gallivm->context),
                                           args, 9, 0);
                }
        }
 
-       if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
-               LLVMValueRef out_ptr;
-               unsigned mask = 0;
-
-               /* Specify the target we are exporting */
-               args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
-
-               args[5] = base->zero; /* R, depth */
-               args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
-               args[7] = base->zero; /* B, sample mask */
-               args[8] = base->zero; /* A, alpha to mask */
-
-               if (depth_index >= 0) {
-                       out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
-                       args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-                       mask |= 0x1;
-                       si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
-               }
-
-               if (stencil_index >= 0) {
-                       out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
-                       args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-                       mask |= 0x2;
-                       si_shader_ctx->shader->db_shader_control |=
-                               S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
-               }
-
-               if (samplemask_index >= 0) {
-                       out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
-                       args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
-                       mask |= 0x4;
-                       si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
-               }
-
-               /* SI (except OLAND) has a bug that it only looks
-                * at the X writemask component. */
-               if (si_shader_ctx->screen->b.chip_class == SI &&
-                   si_shader_ctx->screen->b.family != CHIP_OLAND)
-                       mask |= 0x1;
+       /* Export */
+       si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+                                args);
+       if (is_last) {
+               args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
+               args[2] = bld_base->uint_bld.one; /* DONE bit */
+       }
+       lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+                          LLVMVoidTypeInContext(base->gallivm->context),
+                          args, 9, 0);
+}
 
-               if (samplemask_index >= 0)
-                       si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
-               else if (stencil_index >= 0)
-                       si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
-               else
-                       si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
+static void si_export_null(struct lp_build_tgsi_context *bld_base)
+{
+       struct lp_build_context *base = &bld_base->base;
+       struct lp_build_context *uint = &bld_base->uint_bld;
+       LLVMValueRef args[9];
 
-               /* Specify which components to enable */
-               args[0] = lp_build_const_int32(base->gallivm, mask);
+       args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
+       args[1] = uint->one; /* whether the EXEC mask is valid */
+       args[2] = uint->one; /* DONE bit */
+       args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
+       args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
+       args[5] = uint->undef; /* R */
+       args[6] = uint->undef; /* G */
+       args[7] = uint->undef; /* B */
+       args[8] = uint->undef; /* A */
+
+       lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
+                          LLVMVoidTypeInContext(base->gallivm->context),
+                          args, 9, 0);
+}
 
-               args[1] =
-               args[2] =
-               args[4] = uint->zero;
+static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+       struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
+       struct si_shader * shader = si_shader_ctx->shader;
+       struct lp_build_context * base = &bld_base->base;
+       struct tgsi_shader_info *info = &shader->selector->info;
+       LLVMBuilderRef builder = base->gallivm->builder;
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       int last_color_export = -1;
+       int i;
 
-               if (last_args[0])
-                       lp_build_intrinsic(base->gallivm->builder,
-                                          "llvm.SI.export",
-                                          LLVMVoidTypeInContext(base->gallivm->context),
-                                          args, 9, 0);
-               else
-                       memcpy(last_args, args, sizeof(args));
+       /* If there are no outputs, add a dummy export. */
+       if (!info->num_outputs) {
+               si_export_null(bld_base);
+               return;
        }
 
-       if (!last_args[0]) {
-               /* Specify which components to enable */
-               last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
+       /* Determine the last export. If MRTZ is present, it's always last.
+        * Otherwise, find the last color export.
+        */
+       if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask)
+               for (i = 0; i < info->num_outputs; i++)
+                       if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR)
+                               last_color_export = i;
 
-               /* Specify the target we are exporting */
-               last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+       for (i = 0; i < info->num_outputs; i++) {
+               unsigned semantic_name = info->output_semantic_name[i];
+               unsigned semantic_index = info->output_semantic_index[i];
+               unsigned j;
+               LLVMValueRef color[4] = {};
 
-               /* Set COMPR flag to zero to export data as 32-bit */
-               last_args[4] = uint->zero;
+               /* Select the correct target */
+               switch (semantic_name) {
+               case TGSI_SEMANTIC_POSITION:
+                       depth = LLVMBuildLoad(builder,
+                                             si_shader_ctx->radeon_bld.soa.outputs[i][2], "");
+                       break;
+               case TGSI_SEMANTIC_STENCIL:
+                       stencil = LLVMBuildLoad(builder,
+                                               si_shader_ctx->radeon_bld.soa.outputs[i][1], "");
+                       break;
+               case TGSI_SEMANTIC_SAMPLEMASK:
+                       samplemask = LLVMBuildLoad(builder,
+                                                  si_shader_ctx->radeon_bld.soa.outputs[i][0], "");
+                       break;
+               case TGSI_SEMANTIC_COLOR:
+                       for (j = 0; j < 4; j++)
+                               color[j] = LLVMBuildLoad(builder,
+                                                        si_shader_ctx->radeon_bld.soa.outputs[i][j], "");
 
-               /* dummy bits */
-               last_args[5]= uint->zero;
-               last_args[6]= uint->zero;
-               last_args[7]= uint->zero;
-               last_args[8]= uint->zero;
+                       si_export_mrt_color(bld_base, color, semantic_index,
+                                           last_color_export == i);
+                       break;
+               default:
+                       fprintf(stderr,
+                               "Warning: SI unhandled fs output type:%d\n",
+                               semantic_name);
+               }
        }
 
-       /* Specify whether the EXEC mask represents the valid mask */
-       last_args[1] = uint->one;
-
-       /* Specify that this is the last export */
-       last_args[2] = lp_build_const_int32(base->gallivm, 1);
-
-       lp_build_intrinsic(base->gallivm->builder,
-                          "llvm.SI.export",
-                          LLVMVoidTypeInContext(base->gallivm->context),
-                          last_args, 9, 0);
+       if (depth || stencil || samplemask)
+               si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
@@ -2390,10 +2342,10 @@ static void tex_fetch_ptrs(
 
                ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
 
-               *res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+               *res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
                *res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index);
 
-               *samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+               *samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
                *samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index);
 
                if (target == TGSI_TEXTURE_2D_MSAA ||
@@ -2401,13 +2353,13 @@ static void tex_fetch_ptrs(
                        ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
                                                 lp_build_const_int32(gallivm,
                                                                      SI_FMASK_TEX_OFFSET), "");
-                       *fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+                       *fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
                        *fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index);
                }
        } else {
-               *res_ptr = si_shader_ctx->resources[sampler_index];
-               *samp_ptr = si_shader_ctx->samplers[sampler_index];
-               *fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+               *res_ptr = si_shader_ctx->sampler_views[sampler_index];
+               *samp_ptr = si_shader_ctx->sampler_states[sampler_index];
+               *fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index];
        }
 }
 
@@ -2487,7 +2439,7 @@ static void tex_fetch_args(
                emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
                emit_data->args[0] = res;
                emit_data->args[1] = bld_base->uint_bld.zero;
-               emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
+               emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
                emit_data->arg_count = 3;
                return;
        }
@@ -2536,12 +2488,12 @@ static void tex_fetch_args(
        if (opcode == TGSI_OPCODE_TXB)
                address[count++] = coords[3];
        if (opcode == TGSI_OPCODE_TXB2)
-               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
        /* Pack depth comparison value */
        if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
                if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-                       address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+                       address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
                } else {
                        assert(ref_pos >= 0);
                        address[count++] = coords[ref_pos];
@@ -2612,7 +2564,7 @@ static void tex_fetch_args(
        if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
                address[count++] = coords[3];
        else if (opcode == TGSI_OPCODE_TXL2)
-               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
 
        if (count > 16) {
                assert(!"Cannot handle more than 16 texture address parameters");
@@ -3105,10 +3057,10 @@ static void interp_fetch_args(
                /* offset is in second src, first two channels */
                emit_data->args[0] = lp_build_emit_fetch(bld_base,
                                                         emit_data->inst, 1,
-                                                        0);
+                                                        TGSI_CHAN_X);
                emit_data->args[1] = lp_build_emit_fetch(bld_base,
                                                         emit_data->inst, 1,
-                                                        1);
+                                                        TGSI_CHAN_Y);
                emit_data->arg_count = 2;
        } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
                LLVMValueRef sample_position;
@@ -3119,7 +3071,7 @@ static void interp_fetch_args(
                 * and place into first two channels.
                 */
                sample_id = lp_build_emit_fetch(bld_base,
-                                               emit_data->inst, 1, 0);
+                                               emit_data->inst, 1, TGSI_CHAN_X);
                sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
                                             LLVMInt32TypeInContext(gallivm->context),
                                             "");
@@ -3432,15 +3384,15 @@ static void create_function(struct si_shader_context *si_shader_ctx)
        v16i8 = LLVMVectorType(i8, 16);
 
        params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
-       params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
-       params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
-       params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
-       last_array_pointer = SI_PARAM_RESOURCE;
+       params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
+       params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
+       params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
+       last_array_pointer = SI_PARAM_SAMPLER_VIEWS;
 
        switch (si_shader_ctx->type) {
        case TGSI_PROCESSOR_VERTEX:
-               params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
-               last_array_pointer = SI_PARAM_VERTEX_BUFFER;
+               params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
+               last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
                params[SI_PARAM_BASE_VERTEX] = i32;
                params[SI_PARAM_START_INSTANCE] = i32;
                num_params = SI_PARAM_START_INSTANCE+1;
@@ -3452,8 +3404,8 @@ static void create_function(struct si_shader_context *si_shader_ctx)
                        num_params = SI_PARAM_LS_OUT_LAYOUT+1;
                } else {
                        if (shader->is_gs_copy_shader) {
-                               last_array_pointer = SI_PARAM_CONST;
-                               num_params = SI_PARAM_CONST+1;
+                               last_array_pointer = SI_PARAM_CONST_BUFFERS;
+                               num_params = SI_PARAM_CONST_BUFFERS+1;
                        } else {
                                params[SI_PARAM_VS_STATE_BITS] = i32;
                                num_params = SI_PARAM_VS_STATE_BITS+1;
@@ -3540,7 +3492,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
                params[SI_PARAM_POS_Y_FLOAT] = f32;
                params[SI_PARAM_POS_Z_FLOAT] = f32;
                params[SI_PARAM_POS_W_FLOAT] = f32;
-               params[SI_PARAM_FRONT_FACE] = f32;
+               params[SI_PARAM_FRONT_FACE] = i32;
                params[SI_PARAM_ANCILLARY] = i32;
                params[SI_PARAM_SAMPLE_COVERAGE] = f32;
                params[SI_PARAM_POS_FIXED_PT] = f32;
@@ -3610,7 +3562,7 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
        struct gallivm_state * gallivm = bld_base->base.gallivm;
        const struct tgsi_shader_info * info = bld_base->info;
        unsigned buf;
-       LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+       LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
 
        for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
                unsigned i, num_const = info->const_file_max[buf] + 1;
@@ -3622,14 +3574,14 @@ static void preload_constants(struct si_shader_context *si_shader_ctx)
                si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
 
                /* Load the resource descriptor */
-               si_shader_ctx->const_resource[buf] =
+               si_shader_ctx->const_buffers[buf] =
                        build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
 
                /* Load the constants, we rely on the code sinking to do the rest */
                for (i = 0; i < num_const * 4; ++i) {
                        si_shader_ctx->constants[buf][i] =
                                buffer_load_const(gallivm->builder,
-                                       si_shader_ctx->const_resource[buf],
+                                       si_shader_ctx->const_buffers[buf],
                                        lp_build_const_int32(gallivm, i * 4),
                                        bld_base->base.elem_type);
                }
@@ -3650,23 +3602,23 @@ static void preload_samplers(struct si_shader_context *si_shader_ctx)
        if (num_samplers == 0)
                return;
 
-       res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
-       samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+       res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS);
+       samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES);
 
        /* Load the resources and samplers, we rely on the code sinking to do the rest */
        for (i = 0; i < num_samplers; ++i) {
                /* Resource */
                offset = lp_build_const_int32(gallivm, i);
-               si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
+               si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
 
                /* Sampler */
                offset = lp_build_const_int32(gallivm, i);
-               si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
+               si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
 
                /* FMASK resource */
                if (info->is_msaa_sampler[i]) {
                        offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
-                       si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
+                       si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] =
                                build_indexed_load_const(si_shader_ctx, res_ptr, offset);
                }
        }
@@ -3741,20 +3693,19 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
        }
 }
 
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-                               struct si_shader *shader,
-                               unsigned symbol_offset)
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+                                 struct si_shader_config *conf,
+                                 unsigned symbol_offset)
 {
        unsigned i;
        const unsigned char *config =
-               radeon_shader_binary_config_start(&shader->binary,
-                                               symbol_offset);
+               radeon_shader_binary_config_start(binary, symbol_offset);
 
        /* XXX: We may be able to emit some of these values directly rather than
         * extracting fields to be emitted later.
         */
 
-       for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
+       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
                unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
                unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
                switch (reg) {
@@ -3762,25 +3713,25 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
                case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
                case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
                case R_00B848_COMPUTE_PGM_RSRC1:
-                       shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-                       shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
-                       shader->float_mode =  G_00B028_FLOAT_MODE(value);
-                       shader->rsrc1 = value;
+                       conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+                       conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+                       conf->float_mode =  G_00B028_FLOAT_MODE(value);
+                       conf->rsrc1 = value;
                        break;
                case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-                       shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+                       conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
                        break;
                case R_00B84C_COMPUTE_PGM_RSRC2:
-                       shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
-                       shader->rsrc2 = value;
+                       conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+                       conf->rsrc2 = value;
                        break;
                case R_0286CC_SPI_PS_INPUT_ENA:
-                       shader->spi_ps_input_ena = value;
+                       conf->spi_ps_input_ena = value;
                        break;
                case R_0286E8_SPI_TMPRING_SIZE:
                case R_00B860_COMPUTE_TMPRING_SIZE:
                        /* WAVESIZE is in units of 256 dwords. */
-                       shader->scratch_bytes_per_wave =
+                       conf->scratch_bytes_per_wave =
                                G_00B860_WAVESIZE(value) * 256 * 4 * 1;
                        break;
                default:
@@ -3799,7 +3750,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
        uint32_t scratch_rsrc_dword0 = scratch_va;
        uint32_t scratch_rsrc_dword1 =
                S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+               |  S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64);
 
        for (i = 0 ; i < shader->binary.reloc_count; i++) {
                const struct radeon_shader_reloc *reloc =
@@ -3840,74 +3791,124 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
        return 0;
 }
 
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
+static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
+                                      struct pipe_debug_callback *debug)
 {
-       const struct radeon_shader_binary *binary = &shader->binary;
-       unsigned i;
-       int r;
-       bool dump  = r600_can_dump_shader(&sscreen->b,
-               shader->selector ? shader->selector->tokens : NULL);
+       char *line, *p;
+       unsigned i, count;
+
+       if (binary->disasm_string) {
+               fprintf(stderr, "\nShader Disassembly:\n\n");
+               fprintf(stderr, "%s\n", binary->disasm_string);
+
+               if (debug && debug->debug_message) {
+                       /* Very long debug messages are cut off, so send the
+                        * disassembly one line at a time. This causes more
+                        * overhead, but on the plus side it simplifies
+                        * parsing of resulting logs.
+                        */
+                       pipe_debug_message(debug, SHADER_INFO,
+                                          "Shader Disassembly Begin");
 
-       si_shader_binary_read_config(sscreen, shader, 0);
-       r = si_shader_binary_upload(sscreen, shader);
-       if (r)
-               return r;
-
-       if (dump) {
-               if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
-                       if (binary->disasm_string) {
-                               fprintf(stderr, "\nShader Disassembly:\n\n");
-                               fprintf(stderr, "%s\n", binary->disasm_string);
-                       } else {
-                               fprintf(stderr, "SI CODE:\n");
-                               for (i = 0; i < binary->code_size; i+=4 ) {
-                                       fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-                                       binary->code[i + 2], binary->code[i + 1],
-                                       binary->code[i]);
+                       line = binary->disasm_string;
+                       while (*line) {
+                               p = strchrnul(line, '\n');
+                               count = p - line;
+
+                               if (count) {
+                                       pipe_debug_message(debug, SHADER_INFO,
+                                                          "%.*s", count, line);
                                }
+
+                               if (!*p)
+                                       break;
+                               line = p + 1;
                        }
+
+                       pipe_debug_message(debug, SHADER_INFO,
+                                          "Shader Disassembly End");
+               }
+       } else {
+               fprintf(stderr, "SI CODE:\n");
+               for (i = 0; i < binary->code_size; i += 4) {
+                       fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i,
+                               binary->code[i + 3], binary->code[i + 2],
+                               binary->code[i + 1], binary->code[i]);
                }
+       }
+}
 
+static void si_shader_dump_stats(struct si_screen *sscreen,
+                                struct si_shader_config *conf,
+                                unsigned code_size,
+                                struct pipe_debug_callback *debug,
+                                unsigned processor)
+{
+       if (r600_can_dump_shader(&sscreen->b, processor)) {
                fprintf(stderr, "*** SHADER STATS ***\n"
                        "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
                        "Scratch: %d bytes per wave\n********************\n",
-                       shader->num_sgprs, shader->num_vgprs, binary->code_size,
-                       shader->lds_size, shader->scratch_bytes_per_wave);
+                       conf->num_sgprs, conf->num_vgprs, code_size,
+                       conf->lds_size, conf->scratch_bytes_per_wave);
        }
-       return 0;
+
+       pipe_debug_message(debug, SHADER_INFO,
+                          "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
+                          conf->num_sgprs, conf->num_vgprs, code_size,
+                          conf->lds_size, conf->scratch_bytes_per_wave);
+}
+
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+                   struct pipe_debug_callback *debug, unsigned processor)
+{
+       if (r600_can_dump_shader(&sscreen->b, processor))
+               if (!(sscreen->b.debug_flags & DBG_NO_ASM))
+                       si_shader_dump_disassembly(&shader->binary, debug);
+
+       si_shader_dump_stats(sscreen, &shader->config,
+                            shader->binary.code_size, debug, processor);
 }
 
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-                   LLVMTargetMachineRef tm, LLVMModuleRef mod)
+int si_compile_llvm(struct si_screen *sscreen,
+                   struct radeon_shader_binary *binary,
+                   struct si_shader_config *conf,
+                   LLVMTargetMachineRef tm,
+                   LLVMModuleRef mod,
+                   struct pipe_debug_callback *debug,
+                   unsigned processor)
 {
        int r = 0;
-       bool dump_asm = r600_can_dump_shader(&sscreen->b,
-                               shader->selector ? shader->selector->tokens : NULL);
-       bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
-
-       r = radeon_llvm_compile(mod, &shader->binary,
-               r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
-       if (r)
-               return r;
-
-       r = si_shader_binary_read(sscreen, shader);
-
-       FREE(shader->binary.config);
-       FREE(shader->binary.rodata);
-       FREE(shader->binary.global_symbol_offsets);
-       if (shader->scratch_bytes_per_wave == 0) {
-               FREE(shader->binary.code);
-               FREE(shader->binary.relocs);
-               memset(&shader->binary, 0,
-                      offsetof(struct radeon_shader_binary, disasm_string));
+       unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
+
+       if (r600_can_dump_shader(&sscreen->b, processor)) {
+               fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
+
+               if (!(sscreen->b.debug_flags & DBG_NO_IR))
+                       LLVMDumpModule(mod);
        }
+
+       if (!si_replace_shader(count, binary)) {
+               r = radeon_llvm_compile(mod, binary,
+                       r600_get_llvm_processor_name(sscreen->b.family), tm,
+                       debug);
+               if (r)
+                       return r;
+       }
+
+       si_shader_binary_read_config(binary, conf, 0);
+
+       FREE(binary->config);
+       FREE(binary->global_symbol_offsets);
+       binary->config = NULL;
+       binary->global_symbol_offsets = NULL;
        return r;
 }
 
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
                                      struct si_shader_context *si_shader_ctx,
-                                     struct si_shader *gs, bool dump)
+                                     struct si_shader *gs, bool dump,
+                                     struct pipe_debug_callback *debug)
 {
        struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
        struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -3973,8 +3974,15 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
        if (dump)
                fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
 
-       r = si_compile_llvm(sscreen, si_shader_ctx->shader,
-                           si_shader_ctx->tm, bld_base->base.gallivm->module);
+       r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary,
+                           &si_shader_ctx->shader->config, si_shader_ctx->tm,
+                           bld_base->base.gallivm->module,
+                           debug, TGSI_PROCESSOR_GEOMETRY);
+       if (!r) {
+               si_shader_dump(sscreen, si_shader_ctx->shader, debug,
+                              TGSI_PROCESSOR_GEOMETRY);
+               r = si_shader_binary_upload(sscreen, si_shader_ctx->shader);
+       }
 
        radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
 
@@ -4028,7 +4036,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 }
 
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-                    struct si_shader *shader)
+                    struct si_shader *shader,
+                    struct pipe_debug_callback *debug)
 {
        struct si_shader_selector *sel = shader->selector;
        struct tgsi_token *tokens = sel->tokens;
@@ -4039,11 +4048,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        int r = 0;
        bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
                            shader->key.ps.poly_stipple;
-       bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
+       bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor);
 
        if (poly_stipple) {
                tokens = util_pstipple_create_fragment_shader(tokens, NULL,
-                                               SI_POLY_STIPPLE_SAMPLER);
+                                               SI_POLY_STIPPLE_SAMPLER,
+                                               TGSI_FILE_SYSTEM_VALUE);
                tgsi_scan_shader(tokens, &stipple_shader_info);
        }
 
@@ -4064,9 +4074,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        if (sel->type != PIPE_SHADER_COMPUTE)
                shader->dx10_clamp_mode = true;
 
-       if (sel->info.uses_kill)
-               shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
-
        shader->uses_instanceid = sel->info.uses_instanceid;
        bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
        bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -4141,17 +4148,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        case TGSI_PROCESSOR_FRAGMENT:
                si_shader_ctx.radeon_bld.load_input = declare_input_fs;
                bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
-
-               switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
-               case TGSI_FS_DEPTH_LAYOUT_GREATER:
-                       shader->db_shader_control |=
-                               S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
-                       break;
-               case TGSI_FS_DEPTH_LAYOUT_LESS:
-                       shader->db_shader_control |=
-                               S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
-                       break;
-               }
                break;
        default:
                assert(!"Unsupported shader type");
@@ -4182,12 +4178,21 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
        radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
 
        mod = bld_base->base.gallivm->module;
-       r = si_compile_llvm(sscreen, shader, tm, mod);
+       r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
+                           mod, debug, si_shader_ctx.type);
        if (r) {
                fprintf(stderr, "LLVM failed to compile shader\n");
                goto out;
        }
 
+       si_shader_dump(sscreen, shader, debug, si_shader_ctx.type);
+
+       r = si_shader_binary_upload(sscreen, shader);
+       if (r) {
+               fprintf(stderr, "LLVM failed to upload shader\n");
+               goto out;
+       }
+
        radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
 
        if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
@@ -4196,7 +4201,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                shader->gs_copy_shader->key = shader->key;
                si_shader_ctx.shader = shader->gs_copy_shader;
                if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
-                                                   shader, dump))) {
+                                                   shader, dump, debug))) {
                        free(shader->gs_copy_shader);
                        shader->gs_copy_shader = NULL;
                        goto out;
@@ -4211,6 +4216,14 @@ out:
        return r;
 }
 
+void si_shader_destroy_binary(struct radeon_shader_binary *binary)
+{
+       FREE(binary->code);
+       FREE(binary->rodata);
+       FREE(binary->relocs);
+       FREE(binary->disasm_string);
+}
+
 void si_shader_destroy(struct si_shader *shader)
 {
        if (shader->gs_copy_shader) {
@@ -4222,8 +4235,5 @@ void si_shader_destroy(struct si_shader *shader)
                r600_resource_reference(&shader->scratch_bo, NULL);
 
        r600_resource_reference(&shader->bo, NULL);
-
-       FREE(shader->binary.code);
-       FREE(shader->binary.relocs);
-       FREE(shader->binary.disasm_string);
+       si_shader_destroy_binary(&shader->binary);
 }
index b0c8680..1635358 100644 (file)
@@ -76,10 +76,10 @@ struct radeon_shader_binary;
 struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS     0  /* rings (& stream-out, VS only) */
-#define SI_SGPR_CONST          2
-#define SI_SGPR_SAMPLER                4
-#define SI_SGPR_RESOURCE       6
-#define SI_SGPR_VERTEX_BUFFER  8  /* VS only */
+#define SI_SGPR_CONST_BUFFERS  2
+#define SI_SGPR_SAMPLER_STATES 4
+#define SI_SGPR_SAMPLER_VIEWS  6
+#define SI_SGPR_VERTEX_BUFFERS 8  /* VS only */
 #define SI_SGPR_BASE_VERTEX    10 /* VS only */
 #define SI_SGPR_START_INSTANCE 11 /* VS only */
 #define SI_SGPR_VS_STATE_BITS  12 /* VS(VS) only */
@@ -101,12 +101,12 @@ struct radeon_shader_reloc;
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS    0
-#define SI_PARAM_CONST         1
-#define SI_PARAM_SAMPLER       2
-#define SI_PARAM_RESOURCE      3
+#define SI_PARAM_CONST_BUFFERS 1
+#define SI_PARAM_SAMPLER_STATES        2
+#define SI_PARAM_SAMPLER_VIEWS 3
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFER 4
+#define SI_PARAM_VERTEX_BUFFERS        4
 #define SI_PARAM_BASE_VERTEX   5
 #define SI_PARAM_START_INSTANCE        6
 /* [0] = clamp vertex color */
@@ -201,6 +201,7 @@ struct si_shader_selector {
        bool            forces_persample_interp_for_persp;
        bool            forces_persample_interp_for_linear;
 
+       /* GS parameters. */
        unsigned        esgs_itemsize;
        unsigned        gs_input_verts_per_prim;
        unsigned        gs_output_prim;
@@ -210,6 +211,9 @@ struct si_shader_selector {
        unsigned        gsvs_vertex_size;
        unsigned        max_gsvs_emit_size;
 
+       /* PS parameters. */
+       unsigned        db_shader_control;
+
        /* masks of "get_unique_index" bits */
        uint64_t        outputs_written;
        uint32_t        patch_outputs_written;
@@ -258,6 +262,17 @@ union si_shader_key {
        } tes; /* tessellation evaluation shader */
 };
 
+struct si_shader_config {
+       unsigned                        num_sgprs;
+       unsigned                        num_vgprs;
+       unsigned                        lds_size;
+       unsigned                        spi_ps_input_ena;
+       unsigned                        float_mode;
+       unsigned                        scratch_bytes_per_wave;
+       unsigned                        rsrc1;
+       unsigned                        rsrc2;
+};
+
 struct si_shader {
        struct si_shader_selector       *selector;
        struct si_shader                *next_variant;
@@ -266,18 +281,9 @@ struct si_shader {
        struct si_pm4_state             *pm4;
        struct r600_resource            *bo;
        struct r600_resource            *scratch_bo;
-       struct radeon_shader_binary     binary;
-       unsigned                        num_sgprs;
-       unsigned                        num_vgprs;
-       unsigned                        lds_size;
-       unsigned                        spi_ps_input_ena;
-       unsigned                        float_mode;
-       unsigned                        scratch_bytes_per_wave;
-       unsigned                        spi_shader_col_format;
-       unsigned                        spi_shader_z_format;
-       unsigned                        db_shader_control;
-       unsigned                        cb_shader_mask;
        union si_shader_key             key;
+       struct radeon_shader_binary     binary;
+       struct si_shader_config         config;
 
        unsigned                nparam;
        unsigned                vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
@@ -288,9 +294,6 @@ struct si_shader {
        unsigned                nr_param_exports;
        bool                    is_gs_copy_shader;
        bool                    dx10_clamp_mode; /* convert NaNs to 0 */
-
-       unsigned                rsrc1;
-       unsigned                rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
@@ -327,19 +330,27 @@ static inline bool si_vs_exports_prim_id(struct si_shader *shader)
 
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
-                    struct si_shader *shader);
+                    struct si_shader *shader,
+                    struct pipe_debug_callback *debug);
 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-                   LLVMTargetMachineRef tm, LLVMModuleRef mod);
+int si_compile_llvm(struct si_screen *sscreen,
+                   struct radeon_shader_binary *binary,
+                   struct si_shader_config *conf,
+                   LLVMTargetMachineRef tm,
+                   LLVMModuleRef mod,
+                   struct pipe_debug_callback *debug,
+                   unsigned processor);
 void si_shader_destroy(struct si_shader *shader);
+void si_shader_destroy_binary(struct radeon_shader_binary *binary);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
+void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+                   struct pipe_debug_callback *debug, unsigned processor);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
                        struct si_shader *shader,
                        uint64_t scratch_va);
-void si_shader_binary_read_config(const struct si_screen *sscreen,
-                               struct si_shader *shader,
-                               unsigned symbol_offset);
+void si_shader_binary_read_config(struct radeon_shader_binary *binary,
+                                 struct si_shader_config *conf,
+                                 unsigned symbol_offset);
 
 #endif
index 4086819..2a6d2c6 100644 (file)
@@ -605,6 +605,10 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
                (clipdist_mask ? 0 :
                 sctx->queued.named.rasterizer->clip_plane_enable & SIX_BITS) |
                S_028810_CLIP_DISABLE(window_space));
+
+       /* reuse needs to be set off if we write oViewport */
+       radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
+                              S_028AB4_REUSE_OFF(info->writes_viewport_index));
 }
 
 static void si_set_scissor_states(struct pipe_context *ctx,
@@ -3468,7 +3472,6 @@ static void si_init_config(struct si_context *sctx)
        si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
        si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
-       si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
        si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
        if (sctx->b.chip_class < CIK)
                si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
index e550011..91ccd07 100644 (file)
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
        lds_size = output_patch0_offset + output_patch_size * *num_patches;
-       ls_rsrc2 = ls->current->rsrc2;
+       ls_rsrc2 = ls->current->config.rsrc2;
 
        if (sctx->b.chip_class >= CIK) {
                assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
                radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
        radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-       radeon_emit(cs, ls->current->rsrc1);
+       radeon_emit(cs, ls->current->config.rsrc1);
        radeon_emit(cs, ls_rsrc2);
 
        /* Compute userdata SGPRs. */
@@ -818,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                        si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;
 
-                       u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
+                       u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
                                       &out_offset, &out_buffer, &ptr);
                        if (!out_buffer) {
                                pipe_resource_reference(&ib.buffer, NULL);
@@ -842,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                        start_offset = start * ib.index_size;
 
                        u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
-                                     (char*)ib.user_buffer + start_offset,
+                                     256, (char*)ib.user_buffer + start_offset,
                                      &ib.offset, &ib.buffer);
                        if (!ib.buffer)
                                return;
index f0147ce..8ff70b4 100644 (file)
@@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader)
        vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
 
        num_user_sgprs = SI_LS_NUM_USER_SGPR;
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        if (num_user_sgprs > num_sgprs) {
                /* Last 2 reserved SGPRs are used for VCC */
                num_sgprs = num_user_sgprs + 2;
@@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
        si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 
-       shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+       shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
                           S_00B528_SGPRS((num_sgprs - 1) / 8) |
                           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
                           S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-       shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
-                          S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+       shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+                          S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
@@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader)
        si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 
        num_user_sgprs = SI_TCS_NUM_USER_SGPR;
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        /* One SGPR after user SGPRs is pre-loaded with tessellation factor
         * buffer offset. */
        if ((num_user_sgprs + 1) > num_sgprs) {
@@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
        si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
        si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
-                      S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+                      S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
                       S_00B428_SGPRS((num_sgprs - 1) / 8) |
                       S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
                       S_00B42C_USER_SGPR(num_user_sgprs) |
-                      S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+                      S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_es(struct si_shader *shader)
@@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader)
        } else
                unreachable("invalid shader selector type");
 
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
        if ((num_user_sgprs + 1) > num_sgprs) {
                /* Last 2 reserved SGPRs are used for VCC */
@@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
        si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
        si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
-                      S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+                      S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
                       S_00B328_SGPRS((num_sgprs - 1) / 8) |
                       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
                       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
                       S_00B32C_USER_SGPR(num_user_sgprs) |
-                      S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+                      S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
        if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
                si_set_tesseval_regs(shader, pm4);
@@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
        num_user_sgprs = SI_GS_NUM_USER_SGPR;
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        /* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
        if ((num_user_sgprs + 2) > num_sgprs) {
                /* Last 2 reserved SGPRs are used for VCC */
@@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader)
        assert(num_sgprs <= 104);
 
        si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-                      S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+                      S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
                       S_00B228_SGPRS((num_sgprs - 1) / 8) |
                       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                       S_00B22C_USER_SGPR(num_user_sgprs) |
-                      S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+                      S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader)
        } else
                unreachable("invalid shader selector type");
 
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        if (num_user_sgprs > num_sgprs) {
                /* Last 2 reserved SGPRs are used for VCC */
                num_sgprs = num_user_sgprs + 2;
@@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
        si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
        si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
-                      S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
+                      S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
                       S_00B128_SGPRS((num_sgprs - 1) / 8) |
                       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
                       S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
@@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader)
                       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
                       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
                       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
-                      S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+                      S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
        if (window_space)
                si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
                               S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -387,8 +387,10 @@ static void si_shader_ps(struct si_shader *shader)
        struct tgsi_shader_info *info = &shader->selector->info;
        struct si_pm4_state *pm4;
        unsigned i, spi_ps_in_control;
+       unsigned spi_shader_col_format = 0, cb_shader_mask = 0;
+       unsigned colors_written, export_16bpc;
        unsigned num_sgprs, num_user_sgprs;
-       unsigned spi_baryc_cntl = 0;
+       unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
        uint64_t va;
        bool has_centroid;
 
@@ -397,44 +399,67 @@ static void si_shader_ps(struct si_shader *shader)
        if (!pm4)
                return;
 
-       for (i = 0; i < info->num_inputs; i++) {
-               switch (info->input_semantic_name[i]) {
-               case TGSI_SEMANTIC_POSITION:
-                       /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
-                        * Possible vaules:
-                        * 0 -> Position = pixel center (default)
-                        * 1 -> Position = pixel centroid
-                        * 2 -> Position = at sample position
-                        */
-                       switch (info->input_interpolate_loc[i]) {
-                       case TGSI_INTERPOLATE_LOC_CENTROID:
-                               spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(1);
-                               break;
-                       case TGSI_INTERPOLATE_LOC_SAMPLE:
-                               spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
-                               break;
-                       }
+       /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
+        * Possible vaules:
+        * 0 -> Position = pixel center
+        * 1 -> Position = pixel centroid
+        * 2 -> Position = at sample position
+        *
+        * From GLSL 4.5 specification, section 7.1:
+        *   "The variable gl_FragCoord is available as an input variable from
+        *    within fragment shaders and it holds the window relative coordinates
+        *    (x, y, z, 1/w) values for the fragment. If multi-sampling, this
+        *    value can be for any location within the pixel, or one of the
+        *    fragment samples. The use of centroid does not further restrict
+        *    this value to be inside the current primitive."
+        *
+        * Meaning that centroid has no effect and we can return anything within
+        * the pixel. Thus, return the value at sample position, because that's
+        * the most accurate one shaders can get.
+        */
+       spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
 
-                       if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
-                           TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
-                               spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
-                       break;
-               }
+       if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
+           TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
+               spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
+
+       /* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */
+       colors_written = info->colors_written;
+       export_16bpc = shader->key.ps.export_16bpc;
+
+       if (info->colors_written == 0x1 &&
+           info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) {
+               colors_written |= (1 << (shader->key.ps.last_cbuf + 1)) - 1;
+       }
+
+       while (colors_written) {
+               i = u_bit_scan(&colors_written);
+               if (export_16bpc & (1 << i))
+                       spi_shader_col_format |= V_028714_SPI_SHADER_FP16_ABGR << (4 * i);
+               else
+                       spi_shader_col_format |= V_028714_SPI_SHADER_32_ABGR << (4 * i);
+               cb_shader_mask |= 0xf << (4 * i);
        }
 
-       has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
-                      G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+       /* Set interpolation controls. */
+       has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
+                      G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 
        spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
                            S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 
+       /* Set registers. */
        si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
        si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 
-       si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, shader->spi_shader_z_format);
-       si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT,
-                      shader->spi_shader_col_format);
-       si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, shader->cb_shader_mask);
+       si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT,
+                      info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR :
+                      info->writes_stencil ? V_028710_SPI_SHADER_32_GR :
+                      info->writes_z ? V_028710_SPI_SHADER_32_R :
+                      V_028710_SPI_SHADER_ZERO);
+
+       si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format);
+       si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask);
 
        va = shader->bo->gpu_address;
        si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -442,7 +467,7 @@ static void si_shader_ps(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
 
        num_user_sgprs = SI_PS_NUM_USER_SGPR;
-       num_sgprs = shader->num_sgprs;
+       num_sgprs = shader->config.num_sgprs;
        /* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
        if ((num_user_sgprs + 1) > num_sgprs) {
                /* Last 2 reserved SGPRs are used for VCC */
@@ -451,13 +476,13 @@ static void si_shader_ps(struct si_shader *shader)
        assert(num_sgprs <= 104);
 
        si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
-                      S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
+                      S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
                       S_00B028_SGPRS((num_sgprs - 1) / 8) |
                       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
-                      S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
+                      S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
                       S_00B02C_USER_SGPR(num_user_sgprs) |
-                      S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+                      S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
@@ -496,6 +521,16 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
        }
 }
 
+static unsigned si_get_alpha_test_func(struct si_context *sctx)
+{
+       /* Alpha-test should be disabled if colorbuffer 0 is integer. */
+       if (sctx->queued.named.dsa &&
+           !sctx->framebuffer.cb0_is_integer)
+               return sctx->queued.named.dsa->alpha_func;
+
+       return PIPE_FUNC_ALWAYS;
+}
+
 /* Compute the key for the hw shader variant */
 static inline void si_shader_selector_key(struct pipe_context *ctx,
                                          struct si_shader_selector *sel,
@@ -537,8 +572,10 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
        case PIPE_SHADER_FRAGMENT: {
                struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
-               if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+               if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+                   sel->info.colors_written == 0x1)
                        key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
                key->ps.export_16bpc = sctx->framebuffer.export_16bpc;
 
                if (rs) {
@@ -562,11 +599,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                        key->ps.clamp_color = rs->clamp_fragment_color;
                }
 
-               key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-               /* Alpha-test should be disabled if colorbuffer 0 is integer. */
-               if (sctx->queued.named.dsa &&
-                   !sctx->framebuffer.cb0_is_integer)
-                       key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+               key->ps.alpha_func = si_get_alpha_test_func(sctx);
                break;
        }
        default:
@@ -616,7 +649,7 @@ static int si_shader_select(struct pipe_context *ctx,
        shader->selector = sel;
        shader->key = key;
 
-       r = si_shader_create(sctx->screen, sctx->tm, shader);
+       r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
        if (unlikely(r)) {
                R600_ERR("Failed to build shader variant (type=%u) %d\n",
                         sel->type, r);
@@ -634,7 +667,6 @@ static int si_shader_select(struct pipe_context *ctx,
                sel->last_variant = shader;
        }
        state->current = shader;
-       p_atomic_inc(&sctx->screen->b.num_compilations);
        pipe_mutex_unlock(sel->mutex);
        return 0;
 }
@@ -732,6 +764,25 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                break;
        }
 
+       /* DB_SHADER_CONTROL */
+       sel->db_shader_control =
+               S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+               S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+               S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+               S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+       switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+       case TGSI_FS_DEPTH_LAYOUT_GREATER:
+               sel->db_shader_control |=
+                       S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+               break;
+       case TGSI_FS_DEPTH_LAYOUT_LESS:
+               sel->db_shader_control |=
+                       S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+               break;
+       }
+
+       /* Pre-compilation. */
        if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
                struct si_shader_ctx_state state = {sel};
 
@@ -928,12 +979,6 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
                unsigned index = psinfo->input_semantic_index[i];
                unsigned interpolate = psinfo->input_interpolate[i];
                unsigned param_offset = ps->ps_input_param_offset[i];
-
-               if (name == TGSI_SEMANTIC_POSITION ||
-                   name == TGSI_SEMANTIC_FACE)
-                       /* Read from preloaded VGPRs, not parameters */
-                       continue;
-
 bcolor:
                tmp = 0;
 
@@ -988,7 +1033,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom
        if (!ps)
                return;
 
-       input_ena = ps->spi_ps_input_ena;
+       input_ena = ps->config.spi_ps_input_ena;
 
        /* we need to enable at least one of them, otherwise we hang the GPU */
        assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
@@ -1217,7 +1262,7 @@ static int si_update_scratch_buffer(struct si_context *sctx,
                return 0;
 
        /* This shader doesn't need a scratch buffer */
-       if (shader->scratch_bytes_per_wave == 0)
+       if (shader->config.scratch_bytes_per_wave == 0)
                return 0;
 
        /* This shader is already configured to use the current
@@ -1249,7 +1294,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
 
 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
 {
-       return shader ? shader->scratch_bytes_per_wave : 0;
+       return shader ? shader->config.scratch_bytes_per_wave : 0;
 }
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
@@ -1272,6 +1317,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
                si_get_max_scratch_bytes_per_wave(sctx);
        unsigned scratch_needed_size = scratch_bytes_per_wave *
                sctx->scratch_waves;
+       unsigned spi_tmpring_size;
        int r;
 
        if (scratch_needed_size > 0) {
@@ -1341,8 +1387,12 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
        assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
                "scratch size should already be aligned correctly.");
 
-       sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
-                               S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+       spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+                          S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+       if (spi_tmpring_size != sctx->spi_tmpring_size) {
+               sctx->spi_tmpring_size = spi_tmpring_size;
+               sctx->emit_scratch_reloc = true;
+       }
        return true;
 }
 
@@ -1550,6 +1600,10 @@ bool si_update_shaders(struct si_context *sctx)
        si_update_vgt_shader_config(sctx);
 
        if (sctx->ps_shader.cso) {
+               unsigned db_shader_control =
+                       sctx->ps_shader.cso->db_shader_control |
+                       S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
                r = si_shader_select(ctx, &sctx->ps_shader);
                if (r)
                        return false;
@@ -1569,8 +1623,8 @@ bool si_update_shaders(struct si_context *sctx)
                        si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
                }
 
-               if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
-                       sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
+               if (sctx->ps_db_shader_control != db_shader_control) {
+                       sctx->ps_db_shader_control = db_shader_control;
                        si_mark_atom_dirty(sctx, &sctx->db_render_state);
                }
 
index 8e5e242..d5c4aaa 100644 (file)
@@ -37,6 +37,7 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
+#include "sp_setup.h"
 
 
 /** Do polygon stipple in the draw module? */
@@ -117,17 +118,17 @@ struct softpipe_context {
    unsigned const_buffer_size[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
 
    /** Vertex format */
+   struct sp_setup_info setup_info;
    struct vertex_info vertex_info;
-   struct vertex_info vertex_info_vbuf;
 
    /** Which vertex shader output slot contains point size */
-   int psize_slot;
+   int8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   int viewport_index_slot;
+   int8_t viewport_index_slot;
 
    /** Which vertex shader output slot contains layer */
-   int layer_slot;
+   int8_t layer_slot;
 
    /** The reduced version of the primitive supplied by the state tracker */
    unsigned reduced_api_prim;
index f8a3eac..95d1ac1 100644 (file)
@@ -161,7 +161,7 @@ sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
    struct setup_context *setup = cvbr->setup;
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
@@ -358,7 +358,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
    struct setup_context *setup = cvbr->setup;
-   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const unsigned stride = softpipe->vertex_info.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
index 76105b4..c28d28d 100644 (file)
@@ -223,7 +223,7 @@ softpipe_get_query_result(struct pipe_context *pipe,
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       memcpy(vresult, &sq->stats,
-             sizeof(struct pipe_query_data_pipeline_statistics));;
+             sizeof(struct pipe_query_data_pipeline_statistics));
       break;
    case PIPE_QUERY_GPU_FINISHED:
       vresult->b = TRUE;
index 9939720..143702a 100644 (file)
@@ -251,6 +251,15 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
    }
    /* should only get here on unhandled cases */
index ac2d978..ffe4926 100644 (file)
@@ -38,7 +38,6 @@
 #include "sp_setup.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
-#include "draw/draw_vertex.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -599,10 +598,12 @@ setup_tri_coefficients(struct setup_context *setup)
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float v[3];
 
+   assert(sinfo->valid);
+
    /* z and w are done by linear interpolation:
     */
    v[0] = setup->vmin[0][2];
@@ -618,15 +619,16 @@ setup_tri_coefficients(struct setup_context *setup)
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         for (j = 0; j < TGSI_NUM_CHANNELS; j++)
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
+         for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         }
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -636,7 +638,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                        setup->vmid[vertSlot][j],
@@ -646,7 +648,7 @@ setup_tri_coefficients(struct setup_context *setup)
             tri_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -966,11 +968,13 @@ setup_line_coefficients(struct setup_context *setup,
 {
    struct softpipe_context *softpipe = setup->softpipe;
    const struct tgsi_shader_info *fsInfo = &setup->softpipe->fs_variant->info;
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    float area;
    float v[2];
 
+   assert(sinfo->valid);
+
    /* use setup->vmin, vmax to point to vertices */
    if (softpipe->rasterizer->flatshade_first)
       setup->vprovoke = v0;
@@ -1001,15 +1005,15 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1018,7 +1022,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_linear_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
             line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
                                         setup->vmax[vertSlot][j],
@@ -1027,7 +1031,7 @@ setup_line_coefficients(struct setup_context *setup,
             line_persp_coeff(setup, &setup->coef[fragSlot], j, v);
          }
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
@@ -1236,7 +1240,7 @@ sp_setup_point(struct setup_context *setup,
    const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
    const float x = v0[0][0];  /* Note: data[0] is always position */
    const float y = v0[0][1];
-   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   const struct sp_setup_info *sinfo = &softpipe->setup_info;
    uint fragSlot;
    uint layer = 0;
    unsigned viewport_index = 0;
@@ -1245,6 +1249,8 @@ sp_setup_point(struct setup_context *setup,
    print_vertex(setup, v0);
 #endif
 
+   assert(sinfo->valid);
+
    if (setup->softpipe->no_rast || setup->softpipe->rasterizer->rasterizer_discard)
       return;
 
@@ -1285,22 +1291,22 @@ sp_setup_point(struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < fsInfo->num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      const uint vertSlot = sinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
+      switch (sinfo->attrib[fragSlot].interp) {
+      case SP_INTERP_CONSTANT:
          /* fall-through */
-      case INTERP_LINEAR:
+      case SP_INTERP_LINEAR:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_PERSPECTIVE:
+      case SP_INTERP_PERSPECTIVE:
          for (j = 0; j < TGSI_NUM_CHANNELS; j++)
             point_persp_coeff(setup, setup->vprovoke,
                               &setup->coef[fragSlot], vertSlot, j);
          break;
-      case INTERP_POS:
+      case SP_INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
          break;
       default:
index 191494a..a54dc5d 100644 (file)
 struct setup_context;
 struct softpipe_context;
 
+/**
+ * Attribute interpolation mode
+ */
+enum sp_interp_mode {
+   SP_INTERP_POS,       /**< special case for frag position */
+   SP_INTERP_CONSTANT,
+   SP_INTERP_LINEAR,
+   SP_INTERP_PERSPECTIVE
+};
+
+
+struct sp_setup_info {
+   unsigned valid;
+   struct {
+      unsigned interp:8;      /**< SP_INTERP_X */
+      int src_index:8;
+   } attrib[PIPE_MAX_SHADER_OUTPUTS];
+};
+
 void 
-sp_setup_tri( struct setup_context *setup,
-          const float (*v0)[4],
-          const float (*v1)[4],
-          const float (*v2)[4] );
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4]);
 
 void
 sp_setup_line(struct setup_context *setup,
index c35534c..16a2897 100644 (file)
@@ -175,9 +175,6 @@ softpipe_unmap_texture_surfaces(struct softpipe_context *sp);
 
 
 struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe);
-
-struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe);
 
 
index 7e998af..d4d03f1 100644 (file)
@@ -48,7 +48,7 @@
 static void
 invalidate_vertex_layout(struct softpipe_context *softpipe)
 {
-   softpipe->vertex_info.num_attribs =  0;
+   softpipe->setup_info.valid =  0;
 }
 
 
@@ -57,50 +57,64 @@ invalidate_vertex_layout(struct softpipe_context *softpipe)
  * (simple float[][4]) used by the 'draw' module into vertices for
  * rasterization.
  *
- * This function validates the vertex layout and returns a pointer to a
- * vertex_info object.
+ * This function validates the vertex layout.
  */
-struct vertex_info *
-softpipe_get_vertex_info(struct softpipe_context *softpipe)
+static void
+softpipe_compute_vertex_info(struct softpipe_context *softpipe)
 {
-   struct vertex_info *vinfo = &softpipe->vertex_info;
-   int vs_index;
+   struct sp_setup_info *sinfo = &softpipe->setup_info;
 
-   if (vinfo->num_attribs == 0) {
-      /* compute vertex layout now */
+   if (sinfo->valid == 0) {
       const struct tgsi_shader_info *fsInfo = &softpipe->fs_variant->info;
-      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_num_shader_outputs(softpipe->draw);
+      struct vertex_info *vinfo = &softpipe->vertex_info;
       uint i;
-
-      /* Tell draw_vbuf to simply emit the whole post-xform vertex
-       * as-is.  No longer any need to try and emit draw vertex_header
-       * info.
+      int vs_index;
+      /*
+       * This doesn't quite work right (wrt face injection, prim id,
+       * wide points) - hit a couple assertions, misrenderings plus
+       * memory corruption. Albeit could fix (the former two) by calling
+       * this "more often" (rasterizer changes etc.). (The latter would
+       * need to be included in draw_prepare_shader_outputs, but it looks
+       * like that would potentially allocate quite some unused additional
+       * vertex outputs.)
+       * draw_prepare_shader_outputs(softpipe->draw);
        */
-      vinfo_vbuf->num_attribs = 0;
-      for (i = 0; i < num; i++) {
-        draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-      }
-      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
-       * Loop over fragment shader inputs, searching for the matching output
-       * from the vertex shader.
+       * Those can't actually be 0 (because pos is always at 0).
+       * But use ints anyway to avoid confusion (in vs outputs, they
+       * can very well be at pos 0).
        */
+      softpipe->viewport_index_slot = -1;
+      softpipe->layer_slot = -1;
+      softpipe->psize_slot = -1;
+
       vinfo->num_attribs = 0;
+
+      /*
+       * Put position always first (setup needs it there).
+       */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_POSITION, 0);
+
+      draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+
+      /*
+       * Match FS inputs against VS outputs, emitting the necessary
+       * attributes.
+       */
       for (i = 0; i < fsInfo->num_inputs; i++) {
-         int src;
-         enum interp_mode interp = INTERP_LINEAR;
+         enum sp_interp_mode interp = SP_INTERP_LINEAR;
 
          switch (fsInfo->input_interpolate[i]) {
          case TGSI_INTERPOLATE_CONSTANT:
-            interp = INTERP_CONSTANT;
+            interp = SP_INTERP_CONSTANT;
             break;
          case TGSI_INTERPOLATE_LINEAR:
-            interp = INTERP_LINEAR;
+            interp = SP_INTERP_LINEAR;
             break;
          case TGSI_INTERPOLATE_PERSPECTIVE:
-            interp = INTERP_PERSPECTIVE;
+            interp = SP_INTERP_PERSPECTIVE;
             break;
          case TGSI_INTERPOLATE_COLOR:
             assert(fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR);
@@ -111,88 +125,121 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          switch (fsInfo->input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
-            interp = INTERP_POS;
+            interp = SP_INTERP_POS;
             break;
 
          case TGSI_SEMANTIC_COLOR:
             if (fsInfo->input_interpolate[i] == TGSI_INTERPOLATE_COLOR) {
                if (softpipe->rasterizer->flatshade)
-                  interp = INTERP_CONSTANT;
+                  interp = SP_INTERP_CONSTANT;
                else
-                  interp = INTERP_PERSPECTIVE;
+                  interp = SP_INTERP_PERSPECTIVE;
             }
             break;
          }
 
-         /* this includes texcoords and varying vars */
-         src = draw_find_shader_output(softpipe->draw,
-                                       fsInfo->input_semantic_name[i],
-                                       fsInfo->input_semantic_index[i]);
-        if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR && src == -1)
-          /* try and find a bcolor */
-          src = draw_find_shader_output(softpipe->draw,
-                                        TGSI_SEMANTIC_BCOLOR, fsInfo->input_semantic_index[i]);
+         /*
+          * Search for each input in current vs output:
+          */
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            fsInfo->input_semantic_name[i],
+                                            fsInfo->input_semantic_index[i]);
+
+         if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
+             vs_index == -1) {
+            /*
+             * try and find a bcolor.
+             * Note that if there's both front and back color, draw will
+             * have copied back to front color already.
+             */
+            vs_index = draw_find_shader_output(softpipe->draw,
+                                               TGSI_SEMANTIC_BCOLOR,
+                                               fsInfo->input_semantic_index[i]);
+         }
 
-         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+         sinfo->attrib[i].interp = interp;
+         /* extremely pointless index map */
+         sinfo->attrib[i].src_index = i + 1;
+         /*
+          * For vp index and layer, if the fs requires them but the vs doesn't
+          * provide them, draw (vbuf) will give us the required 0 (slot -1).
+          * (This means in this case we'll also use those slots in setup, which
+          * isn't necessary but they'll contain the correct (0) value.)
+          */
+         if (fsInfo->input_semantic_name[i] ==
+                    TGSI_SEMANTIC_VIEWPORT_INDEX) {
+            softpipe->viewport_index_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         } else if (fsInfo->input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+         } else {
+            /*
+             * Note that we'd actually want to skip position (as we won't use
+             * the attribute in the fs) but can't. The reason is that we don't
+             * actually have a input/output map for setup (even though it looks
+             * like we do...). Could adjust for this though even without a map.
+             */
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need pointsize as well. */
+      /* Figure out if we need pointsize as well.
+       */
       vs_index = draw_find_shader_output(softpipe->draw,
                                          TGSI_SEMANTIC_PSIZE, 0);
 
       if (vs_index >= 0) {
-         softpipe->psize_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+         softpipe->psize_slot = (int)vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
       }
 
-      /* Figure out if we need viewport index */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->viewport_index_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->viewport_index_slot = 0;
+      /* Figure out if we need viewport index (if it wasn't already in fs input) */
+      if (softpipe->viewport_index_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->viewport_index_slot =(int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
-      /* Figure out if we need layer */
-      vs_index = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_LAYER,
-                                         0);
-      if (vs_index >= 0) {
-         softpipe->layer_slot = vinfo->num_attribs;
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-      } else {
-         softpipe->layer_slot = 0;
+      /* Figure out if we need layer (if it wasn't already in fs input) */
+      if (softpipe->layer_slot < 0) {
+         vs_index = draw_find_shader_output(softpipe->draw,
+                                            TGSI_SEMANTIC_LAYER,
+                                            0);
+         if (vs_index >= 0) {
+            softpipe->layer_slot = (int)vinfo->num_attribs;
+            draw_emit_vertex_attr(vinfo, EMIT_4F, vs_index);
+         }
       }
 
       draw_compute_vertex_size(vinfo);
+      softpipe->setup_info.valid = 1;
    }
-
-   return vinfo;
+   return;
 }
 
 
 /**
  * Called from vbuf module.
  *
- * Note that there's actually two different vertex layouts in softpipe.
- *
- * The normal one is computed in softpipe_get_vertex_info() above and is
- * used by the point/line/tri "setup" code.
- *
- * The other one (this one) is only used by the vbuf module (which is
- * not normally used by default but used in testing).  For the vbuf module,
- * we basically want to pass-through the draw module's vertex layout as-is.
- * When the softpipe vbuf code begins drawing, the normal vertex layout
- * will come into play again.
+ * This will trigger validation of the vertex layout (and also compute
+ * the required information for setup).
  */
 struct vertex_info *
 softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 {
-   (void) softpipe_get_vertex_info(softpipe);
-   return &softpipe->vertex_info_vbuf;
+   softpipe_compute_vertex_info(softpipe);
+   return &softpipe->vertex_info;
 }
 
 
index dce0404..f0d66a5 100644 (file)
@@ -64,7 +64,8 @@ create_fs_variant(struct softpipe_context *softpipe,
          /* get new shader that implements polygon stippling */
          var->tokens = 
             util_pstipple_create_fragment_shader(curfs->tokens,
-                                                 &var->stipple_sampler_unit, 0);
+                                                 &var->stipple_sampler_unit, 0,
+                                                 TGSI_FILE_INPUT);
       }
       else
 #endif
index 10442cb..e45b3e7 100644 (file)
@@ -337,7 +337,7 @@ SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
    mipSizes[0].height = height;
    mipSizes[0].depth = 1;
 
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -372,7 +372,7 @@ SVGA3D_DestroySurface(struct svga_winsys_context *swc,
    
    swc->surface_relocation(swc, &cmd->sid, NULL, sid,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
-   swc->commit(swc);;
+   swc->commit(swc);
 
    return PIPE_OK;
 }
@@ -473,6 +473,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -543,6 +544,7 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    pSuffix->flags = flags;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1016,7 +1018,7 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
    *decls = declArray;
    *ranges = rangeArray;
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1720,6 +1722,7 @@ SVGA3D_UpdateGBImage(struct svga_winsys_context *swc,
    cmd->box = *box;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1746,6 +1749,7 @@ SVGA3D_UpdateGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_WRITE | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1775,6 +1779,7 @@ SVGA3D_ReadbackGBImage(struct svga_winsys_context *swc,
    cmd->image.mipmap = mipLevel;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1801,6 +1806,7 @@ SVGA3D_ReadbackGBSurface(struct svga_winsys_context *swc,
                            SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
@@ -1829,6 +1835,7 @@ SVGA3D_ReadbackGBImagePartial(struct svga_winsys_context *swc,
    cmd->invertBox = invertBox;
 
    swc->commit(swc);
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
 
    return PIPE_OK;
 }
index 271ee8e..26e4690 100644 (file)
@@ -638,4 +638,8 @@ SVGA3D_vgpu10_UpdateSubResource(struct svga_winsys_context *swc,
                                 const SVGA3dBox *box,
                                 unsigned subResource);
 
+enum pipe_error
+SVGA3D_vgpu10_GenMips(struct svga_winsys_context *swc,
+                      const SVGA3dShaderResourceViewId shaderResourceViewId,
+                      struct svga_winsys_surface *view);
 #endif /* __SVGA3D_H__ */
index 5c12108..99c9add 100644 (file)
@@ -535,7 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc,
 
    SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -551,7 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation,
                        baseVertexLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -568,7 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount,
                        startVertexLocation, startInstanceLocation);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -588,7 +588,7 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc,
                        startInstanceLocation);
 
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -598,7 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc)
 {
    SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO);
 
-   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -1293,3 +1293,24 @@ SVGA3D_vgpu10_UpdateSubResource(struct svga_winsys_context *swc,
    swc->commit(swc);
    return PIPE_OK;
 }
+
+enum pipe_error
+SVGA3D_vgpu10_GenMips(struct svga_winsys_context *swc,
+                      SVGA3dShaderResourceViewId shaderResourceViewId,
+                      struct svga_winsys_surface *view)
+{
+   SVGA3dCmdDXGenMips *cmd;
+
+   cmd = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_DX_GENMIPS,
+                            sizeof(SVGA3dCmdDXGenMips), 1);
+
+   if (!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->surface_relocation(swc, &cmd->shaderResourceViewId, NULL, view,
+                           SVGA_RELOC_WRITE);
+   cmd->shaderResourceViewId = shaderResourceViewId;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
index d407785..b10eb45 100644 (file)
@@ -46,7 +46,6 @@
 #include "svga_winsys.h"
 
 #define CONST0_UPLOAD_DEFAULT_SIZE 65536
-#define CONST0_UPLOAD_ALIGNMENT 256
 
 DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE);
@@ -220,8 +219,8 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
 
    svga->const0_upload = u_upload_create(&svga->pipe,
                                          CONST0_UPLOAD_DEFAULT_SIZE,
-                                         CONST0_UPLOAD_ALIGNMENT,
-                                         PIPE_BIND_CONSTANT_BUFFER);
+                                         PIPE_BIND_CONSTANT_BUFFER,
+                                         PIPE_USAGE_STREAM);
    if (!svga->const0_upload)
       goto cleanup;
 
index 78e346a..f1a2041 100644 (file)
@@ -59,8 +59,9 @@
 #define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 9)
 #define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 10)
 #define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 12)
 /*SVGA_QUERY_MAX has to be last because it is size of an array*/
-#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 13)
 
 /**
  * Maximum supported number of constant buffers per shader
@@ -74,6 +75,8 @@
  */
 #define SVGA_MAX_CONST_BUF_SIZE (4096 * 4 * sizeof(int))
 
+#define CONST0_UPLOAD_ALIGNMENT 256
+
 struct draw_vertex_shader;
 struct draw_fragment_shader;
 struct svga_shader_variant;
@@ -312,7 +315,7 @@ struct svga_hw_view_state
    struct svga_sampler_view *v;
    unsigned min_lod;
    unsigned max_lod;
-   int dirty;
+   boolean dirty;
 };
 
 /* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )
@@ -343,6 +346,11 @@ struct svga_hw_draw_state
    SVGA3dElementLayoutId layout_id;
    SVGA3dPrimitiveType topology;
 
+   /** Vertex buffer state */
+   SVGA3dVertexBuffer vbuffers[PIPE_MAX_ATTRIBS];
+   struct svga_winsys_surface *vbuffer_handles[PIPE_MAX_ATTRIBS];
+   unsigned num_vbuffers;
+
    struct svga_winsys_surface *ib;  /**< index buffer for drawing */
    SVGA3dSurfaceFormat ib_format;
    unsigned ib_offset;
@@ -498,6 +506,7 @@ struct svga_context
       uint64_t num_state_objects;    /**< SVGA_QUERY_NUM_STATE_OBJECTS */
       uint64_t num_surface_views;    /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
       uint64_t num_bytes_uploaded;   /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
+      uint64_t num_generate_mipmap;  /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
    } hud;
 
    /** The currently bound stream output targets */
index 2d3631d..80526ed 100644 (file)
@@ -517,11 +517,27 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
          buffers[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
       }
       if (vbuf_count > 0) {
-         ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
-                                              0,    /* startBuffer */
-                                              buffers, vb_handle);
-         if (ret != PIPE_OK)
-            return ret;
+         /* If we haven't yet emitted a drawing command or if any
+          * vertex buffer state is changing, issue that state now.
+          */
+         if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) ||
+             vbuf_count != svga->state.hw_draw.num_vbuffers ||
+             memcmp(buffers, svga->state.hw_draw.vbuffers,
+                    vbuf_count * sizeof(buffers[0])) ||
+             memcmp(vb_handle, svga->state.hw_draw.vbuffer_handles,
+                    vbuf_count * sizeof(vb_handle[0]))) {
+            ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, vbuf_count,
+                                                 0,    /* startBuffer */
+                                                 buffers, vb_handle);
+            if (ret != PIPE_OK)
+               return ret;
+
+            svga->state.hw_draw.num_vbuffers = vbuf_count;
+            memcpy(svga->state.hw_draw.vbuffers, buffers,
+                   vbuf_count * sizeof(buffers[0]));
+            memcpy(svga->state.hw_draw.vbuffer_handles, vb_handle,
+                   vbuf_count * sizeof(vb_handle[0]));
+         }
       }
    }
 
index 2b549df..0186736 100644 (file)
@@ -48,16 +48,16 @@ static const struct vgpu10_format_entry format_conversion_table[] =
 {
    /* Gallium format                    SVGA3D vertex format        SVGA3D pixel format          Flags */
    { PIPE_FORMAT_NONE,                  SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_B8G8R8A8_UNORM,        SVGA3D_B8G8R8A8_UNORM,      SVGA3D_B8G8R8A8_UNORM,       0 },
-   { PIPE_FORMAT_B8G8R8X8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8X8_UNORM,       0 },
+   { PIPE_FORMAT_B8G8R8A8_UNORM,        SVGA3D_B8G8R8A8_UNORM,      SVGA3D_B8G8R8A8_UNORM,       TF_GEN_MIPS },
+   { PIPE_FORMAT_B8G8R8X8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8X8_UNORM,       TF_GEN_MIPS },
    { PIPE_FORMAT_A8R8G8B8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_X8R8G8B8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_B5G5R5A1_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_B5G5R5A1_UNORM,       0 },
+   { PIPE_FORMAT_B5G5R5A1_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_B5G5R5A1_UNORM,       TF_GEN_MIPS },
    { PIPE_FORMAT_B4G4R4A4_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_B5G6R5_UNORM,          SVGA3D_FORMAT_INVALID,      SVGA3D_B5G6R5_UNORM,         0 },
-   { PIPE_FORMAT_R10G10B10A2_UNORM,     SVGA3D_R10G10B10A2_UNORM,   SVGA3D_R10G10B10A2_UNORM,    0 },
+   { PIPE_FORMAT_B5G6R5_UNORM,          SVGA3D_FORMAT_INVALID,      SVGA3D_B5G6R5_UNORM,         TF_GEN_MIPS },
+   { PIPE_FORMAT_R10G10B10A2_UNORM,     SVGA3D_R10G10B10A2_UNORM,   SVGA3D_R10G10B10A2_UNORM,    TF_GEN_MIPS },
    { PIPE_FORMAT_L8_UNORM,              SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_A8_UNORM,              SVGA3D_FORMAT_INVALID,      SVGA3D_A8_UNORM,             0 },
+   { PIPE_FORMAT_A8_UNORM,              SVGA3D_FORMAT_INVALID,      SVGA3D_A8_UNORM,             TF_GEN_MIPS },
    { PIPE_FORMAT_I8_UNORM,              SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_L8A8_UNORM,            SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_L16_UNORM,             SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
@@ -75,10 +75,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
    { PIPE_FORMAT_R64G64_FLOAT,          SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R64G64B64_FLOAT,       SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R64G64B64A64_FLOAT,    SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_R32_FLOAT,             SVGA3D_R32_FLOAT,           SVGA3D_R32_FLOAT,            0 },
-   { PIPE_FORMAT_R32G32_FLOAT,          SVGA3D_R32G32_FLOAT,        SVGA3D_R32G32_FLOAT,         0 },
-   { PIPE_FORMAT_R32G32B32_FLOAT,       SVGA3D_R32G32B32_FLOAT,     SVGA3D_R32G32B32_FLOAT,      0 },
-   { PIPE_FORMAT_R32G32B32A32_FLOAT,    SVGA3D_R32G32B32A32_FLOAT,  SVGA3D_R32G32B32A32_FLOAT,   0 },
+   { PIPE_FORMAT_R32_FLOAT,             SVGA3D_R32_FLOAT,           SVGA3D_R32_FLOAT,            TF_GEN_MIPS },
+   { PIPE_FORMAT_R32G32_FLOAT,          SVGA3D_R32G32_FLOAT,        SVGA3D_R32G32_FLOAT,         TF_GEN_MIPS },
+   { PIPE_FORMAT_R32G32B32_FLOAT,       SVGA3D_R32G32B32_FLOAT,     SVGA3D_R32G32B32_FLOAT,      TF_GEN_MIPS },
+   { PIPE_FORMAT_R32G32B32A32_FLOAT,    SVGA3D_R32G32B32A32_FLOAT,  SVGA3D_R32G32B32A32_FLOAT,   TF_GEN_MIPS },
    { PIPE_FORMAT_R32_UNORM,             SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R32G32_UNORM,          SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R32G32B32_UNORM,       SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
@@ -95,10 +95,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
    { PIPE_FORMAT_R32G32_SSCALED,        SVGA3D_R32G32_SINT,         SVGA3D_FORMAT_INVALID,       VF_I_TO_F_CAST },
    { PIPE_FORMAT_R32G32B32_SSCALED,     SVGA3D_R32G32B32_SINT,      SVGA3D_FORMAT_INVALID,       VF_I_TO_F_CAST },
    { PIPE_FORMAT_R32G32B32A32_SSCALED,  SVGA3D_R32G32B32A32_SINT,   SVGA3D_FORMAT_INVALID,       VF_I_TO_F_CAST },
-   { PIPE_FORMAT_R16_UNORM,             SVGA3D_R16_UNORM,           SVGA3D_R16_UNORM,            0 },
-   { PIPE_FORMAT_R16G16_UNORM,          SVGA3D_R16G16_UNORM,        SVGA3D_R16G16_UNORM,         0 },
+   { PIPE_FORMAT_R16_UNORM,             SVGA3D_R16_UNORM,           SVGA3D_R16_UNORM,            TF_GEN_MIPS },
+   { PIPE_FORMAT_R16G16_UNORM,          SVGA3D_R16G16_UNORM,        SVGA3D_R16G16_UNORM,         TF_GEN_MIPS },
    { PIPE_FORMAT_R16G16B16_UNORM,       SVGA3D_R16G16B16A16_UNORM,  SVGA3D_FORMAT_INVALID,       VF_W_TO_1 },
-   { PIPE_FORMAT_R16G16B16A16_UNORM,    SVGA3D_R16G16B16A16_UNORM,  SVGA3D_R16G16B16A16_UNORM,   0 },
+   { PIPE_FORMAT_R16G16B16A16_UNORM,    SVGA3D_R16G16B16A16_UNORM,  SVGA3D_R16G16B16A16_UNORM,   TF_GEN_MIPS },
    { PIPE_FORMAT_R16_USCALED,           SVGA3D_R16_UINT,            SVGA3D_FORMAT_INVALID,       VF_U_TO_F_CAST },
    { PIPE_FORMAT_R16G16_USCALED,        SVGA3D_R16G16_UINT,         SVGA3D_FORMAT_INVALID,       VF_U_TO_F_CAST },
    { PIPE_FORMAT_R16G16B16_USCALED,     SVGA3D_R16G16B16A16_UINT,   SVGA3D_FORMAT_INVALID,       VF_W_TO_1 | VF_U_TO_F_CAST },
@@ -111,10 +111,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
    { PIPE_FORMAT_R16G16_SSCALED,        SVGA3D_R16G16_SINT,         SVGA3D_FORMAT_INVALID,       VF_I_TO_F_CAST },
    { PIPE_FORMAT_R16G16B16_SSCALED,     SVGA3D_R16G16B16A16_SINT,   SVGA3D_FORMAT_INVALID,       VF_W_TO_1 | VF_I_TO_F_CAST },
    { PIPE_FORMAT_R16G16B16A16_SSCALED,  SVGA3D_R16G16B16A16_SINT,   SVGA3D_FORMAT_INVALID,       VF_I_TO_F_CAST },
-   { PIPE_FORMAT_R8_UNORM,              SVGA3D_R8_UNORM,            SVGA3D_R8_UNORM,             0 },
-   { PIPE_FORMAT_R8G8_UNORM,            SVGA3D_R8G8_UNORM,          SVGA3D_R8G8_UNORM,           0 },
+   { PIPE_FORMAT_R8_UNORM,              SVGA3D_R8_UNORM,            SVGA3D_R8_UNORM,             TF_GEN_MIPS },
+   { PIPE_FORMAT_R8G8_UNORM,            SVGA3D_R8G8_UNORM,          SVGA3D_R8G8_UNORM,           TF_GEN_MIPS },
    { PIPE_FORMAT_R8G8B8_UNORM,          SVGA3D_R8G8B8A8_UNORM,      SVGA3D_FORMAT_INVALID,       VF_W_TO_1 },
-   { PIPE_FORMAT_R8G8B8A8_UNORM,        SVGA3D_R8G8B8A8_UNORM,      SVGA3D_R8G8B8A8_UNORM,       0 },
+   { PIPE_FORMAT_R8G8B8A8_UNORM,        SVGA3D_R8G8B8A8_UNORM,      SVGA3D_R8G8B8A8_UNORM,       TF_GEN_MIPS },
    { PIPE_FORMAT_X8B8G8R8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R8_USCALED,            SVGA3D_R8_UINT,             SVGA3D_FORMAT_INVALID,       VF_U_TO_F_CAST },
    { PIPE_FORMAT_R8G8_USCALED,          SVGA3D_R8G8_UINT,           SVGA3D_FORMAT_INVALID,       VF_U_TO_F_CAST },
@@ -138,20 +138,20 @@ static const struct vgpu10_format_entry format_conversion_table[] =
    { PIPE_FORMAT_R32G32_FIXED,          SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R32G32B32_FIXED,       SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R32G32B32A32_FIXED,    SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_R16_FLOAT,             SVGA3D_R16_FLOAT,           SVGA3D_R16_FLOAT,            0 },
-   { PIPE_FORMAT_R16G16_FLOAT,          SVGA3D_R16G16_FLOAT,        SVGA3D_R16G16_FLOAT,         0 },
+   { PIPE_FORMAT_R16_FLOAT,             SVGA3D_R16_FLOAT,           SVGA3D_R16_FLOAT,            TF_GEN_MIPS },
+   { PIPE_FORMAT_R16G16_FLOAT,          SVGA3D_R16G16_FLOAT,        SVGA3D_R16G16_FLOAT,         TF_GEN_MIPS },
    { PIPE_FORMAT_R16G16B16_FLOAT,       SVGA3D_R16G16B16A16_FLOAT,  SVGA3D_FORMAT_INVALID,       VF_W_TO_1 },
-   { PIPE_FORMAT_R16G16B16A16_FLOAT,    SVGA3D_R16G16B16A16_FLOAT,  SVGA3D_R16G16B16A16_FLOAT,   0 },
+   { PIPE_FORMAT_R16G16B16A16_FLOAT,    SVGA3D_R16G16B16A16_FLOAT,  SVGA3D_R16G16B16A16_FLOAT,   TF_GEN_MIPS },
    { PIPE_FORMAT_L8_SRGB,               SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_L8A8_SRGB,             SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R8G8B8_SRGB,           SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_A8B8G8R8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_X8B8G8R8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_B8G8R8A8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8A8_UNORM_SRGB,  0 },
-   { PIPE_FORMAT_B8G8R8X8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8X8_UNORM_SRGB,  0 },
+   { PIPE_FORMAT_B8G8R8A8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8A8_UNORM_SRGB,  TF_GEN_MIPS },
+   { PIPE_FORMAT_B8G8R8X8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_B8G8R8X8_UNORM_SRGB,  TF_GEN_MIPS },
    { PIPE_FORMAT_A8R8G8B8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_X8R8G8B8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
-   { PIPE_FORMAT_R8G8B8A8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_R8G8B8A8_UNORM_SRGB,  0 },
+   { PIPE_FORMAT_R8G8B8A8_SRGB,         SVGA3D_FORMAT_INVALID,      SVGA3D_R8G8B8A8_UNORM_SRGB,  TF_GEN_MIPS },
    { PIPE_FORMAT_DXT1_RGB,              SVGA3D_FORMAT_INVALID,      SVGA3D_BC1_UNORM,            0 },
    { PIPE_FORMAT_DXT1_RGBA,             SVGA3D_FORMAT_INVALID,      SVGA3D_BC1_UNORM,            0 },
    { PIPE_FORMAT_DXT3_RGBA,             SVGA3D_FORMAT_INVALID,      SVGA3D_BC2_UNORM,            0 },
@@ -171,7 +171,7 @@ static const struct vgpu10_format_entry format_conversion_table[] =
    { PIPE_FORMAT_A8B8G8R8_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_B5G5R5X1_UNORM,        SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
    { PIPE_FORMAT_R10G10B10A2_USCALED,   SVGA3D_R10G10B10A2_UNORM,   SVGA3D_FORMAT_INVALID,       VF_PUINT_TO_USCALED },
-   { PIPE_FORMAT_R11G11B10_FLOAT,       SVGA3D_FORMAT_INVALID,      SVGA3D_R11G11B10_FLOAT,      0 },
+   { PIPE_FORMAT_R11G11B10_FLOAT,       SVGA3D_FORMAT_INVALID,      SVGA3D_R11G11B10_FLOAT,      TF_GEN_MIPS },
    { PIPE_FORMAT_R9G9B9E5_FLOAT,        SVGA3D_FORMAT_INVALID,      SVGA3D_R9G9B9E5_SHAREDEXP,   0 },
    { PIPE_FORMAT_Z32_FLOAT_S8X24_UINT,  SVGA3D_FORMAT_INVALID,      SVGA3D_D32_FLOAT_S8X24_UINT, 0 },
    { PIPE_FORMAT_R1_UNORM,              SVGA3D_FORMAT_INVALID,      SVGA3D_FORMAT_INVALID,       0 },
@@ -1967,6 +1967,13 @@ svga_format_is_integer(SVGA3dSurfaceFormat format)
    }
 }
 
+boolean
+svga_format_support_gen_mips(enum pipe_format format)
+{
+   assert(format < Elements(format_conversion_table));
+   return ((format_conversion_table[format].flags & TF_GEN_MIPS) > 0);
+}
+
 
 /**
  * Given a texture format, return the expected data type returned from
index 9f9a530..630a86a 100644 (file)
@@ -52,6 +52,10 @@ struct svga_screen;
 #define VF_PUINT_TO_USCALED (1 << 6)  /* 10_10_10_2 to uscaled */
 #define VF_PUINT_TO_SSCALED (1 << 7)  /* 10_10_10_2 to sscaled */
 
+/**
+ * Texture format flags.
+ */
+#define TF_GEN_MIPS         (1 << 8)  /* supports hw generate mipmap */
 
 void
 svga_translate_vertex_format_vgpu10(enum pipe_format format,
@@ -80,6 +84,9 @@ svga_format_name(SVGA3dSurfaceFormat format);
 boolean
 svga_format_is_integer(SVGA3dSurfaceFormat format);
 
+boolean
+svga_format_support_gen_mips(enum pipe_format format);
+
 enum tgsi_return_type
 svga_get_texture_datatype(enum pipe_format format);
 
index b67d56c..255494a 100644 (file)
@@ -732,6 +732,7 @@ svga_create_query(struct pipe_context *pipe,
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_RESOURCES_MAPPED:
    case SVGA_QUERY_NUM_BYTES_UPLOADED:
+   case SVGA_QUERY_NUM_GENERATE_MIPMAP:
       break;
    default:
       assert(!"unexpected query type in svga_create_query()");
@@ -800,6 +801,7 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_RESOURCES_MAPPED:
    case SVGA_QUERY_NUM_BYTES_UPLOADED:
+   case SVGA_QUERY_NUM_GENERATE_MIPMAP:
       /* nothing */
       break;
    default:
@@ -887,6 +889,7 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_RESOURCES:
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
+   case SVGA_QUERY_NUM_GENERATE_MIPMAP:
       /* nothing */
       break;
    default:
@@ -980,6 +983,7 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_RESOURCES:
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
+   case SVGA_QUERY_NUM_GENERATE_MIPMAP:
       /* nothing */
       break;
    default:
@@ -1090,6 +1094,9 @@ svga_get_query_result(struct pipe_context *pipe,
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
       vresult->u64 = svga->hud.num_surface_views;
       break;
+   case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+      vresult->u64 = svga->hud.num_generate_mipmap;
+      break;
    default:
       assert(!"unexpected query type in svga_get_query_result");
    }
index fa1744f..8e0db53 100644 (file)
@@ -368,13 +368,16 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe,
    struct svga_context *svga = svga_context(pipe);
    struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
 
+   if (!raster ||
+       !svga->curr.rast ||
+       raster->templ.poly_stipple_enable !=
+       svga->curr.rast->templ.poly_stipple_enable) {
+      svga->dirty |= SVGA_NEW_STIPPLE;
+   }
+
    svga->curr.rast = raster;
 
    svga->dirty |= SVGA_NEW_RAST;
-
-   if (raster && raster->templ.poly_stipple_enable) {
-      svga->dirty |= SVGA_NEW_STIPPLE;
-   }
 }
 
 static void
index 9524117..3e778f0 100644 (file)
@@ -287,6 +287,7 @@ svga_bind_sampler_states(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= PIPE_MAX_SAMPLERS);
@@ -295,8 +296,15 @@ svga_bind_sampler_states(struct pipe_context *pipe,
    if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT)
       return;
 
-   for (i = 0; i < num; i++)
+   for (i = 0; i < num; i++) {
+      if (svga->curr.sampler[shader][start + i] != samplers[i])
+         any_change = TRUE;
       svga->curr.sampler[shader][start + i] = samplers[i];
+   }
+
+   if (!any_change) {
+      return;
+   }
 
    /* find highest non-null sampler[] entry */
    {
@@ -405,6 +413,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
    unsigned flag_1d = 0;
    unsigned flag_srgb = 0;
    uint i;
+   boolean any_change = FALSE;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(start + num <= Elements(svga->curr.sampler_views[shader]));
@@ -422,6 +431,7 @@ svga_set_sampler_views(struct pipe_context *pipe,
          pipe_sampler_view_release(pipe, &svga->curr.sampler_views[shader][start + i]);
          pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i],
                                      views[i]);
+         any_change = TRUE;
       }
 
       if (!views[i])
@@ -434,6 +444,10 @@ svga_set_sampler_views(struct pipe_context *pipe,
          flag_1d |= 1 << (start + i);
    }
 
+   if (!any_change) {
+      return;
+   }
+
    /* find highest non-null sampler_views[] entry */
    {
       unsigned j = MAX2(svga->curr.num_sampler_views[shader], start + num);
index a910ae0..1c3bcd6 100644 (file)
@@ -107,6 +107,12 @@ svga_init_resource_functions(struct svga_context *svga)
    svga->pipe.transfer_flush_region = u_transfer_flush_region_vtbl;
    svga->pipe.transfer_unmap = u_transfer_unmap_vtbl;
    svga->pipe.transfer_inline_write = u_transfer_inline_write_vtbl;
+
+   if (svga_have_vgpu10(svga)) {
+      svga->pipe.generate_mipmap = svga_texture_generate_mipmap;
+   } else {
+      svga->pipe.generate_mipmap = NULL;
+   }
 }
 
 void
index 8c5cff5..7f7ceab 100644 (file)
@@ -221,7 +221,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    struct svga_3d_update_gb_image *whole_update_cmd = NULL;
    uint32 numBoxes = sbuf->map.num_ranges;
    struct pipe_resource *dummy;
-   unsigned int i;
+   unsigned i;
 
    assert(numBoxes);
    assert(sbuf->dma.updates == NULL);
@@ -308,6 +308,7 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    pipe_resource_reference(&dummy, &sbuf->b.b);
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
@@ -381,6 +382,7 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    SVGA_FIFOCommitAll(swc);
 
+   swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
index 4c7aeff..3f754c4 100644 (file)
@@ -993,3 +993,61 @@ svga_texture_from_handle(struct pipe_screen *screen,
 
    return &tex->b.b;
 }
+
+boolean
+svga_texture_generate_mipmap(struct pipe_context *pipe,
+                             struct pipe_resource *pt,
+                             enum pipe_format format,
+                             unsigned base_level,
+                             unsigned last_level,
+                             unsigned first_layer,
+                             unsigned last_layer)
+{
+   struct pipe_sampler_view templ, *psv;
+   struct svga_pipe_sampler_view *sv;
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_texture *tex = svga_texture(pt);
+   enum pipe_error ret;
+
+   assert(svga_have_vgpu10(svga));
+
+   /* Only support 2D texture for now */
+   if (pt->target != PIPE_TEXTURE_2D)
+      return FALSE;
+
+   /* Fallback to the mipmap generation utility for those formats that
+    * do not support hw generate mipmap
+    */
+   if (!svga_format_support_gen_mips(format))
+      return FALSE;
+
+   /* Make sure the texture surface was created with
+    * SVGA3D_SURFACE_BIND_RENDER_TARGET
+    */
+   if (!tex->handle || !(tex->key.flags & SVGA3D_SURFACE_BIND_RENDER_TARGET))
+      return FALSE;
+
+   templ.format = format;
+   templ.u.tex.first_layer = first_layer;
+   templ.u.tex.last_layer = last_layer;
+   templ.u.tex.first_level = base_level;
+   templ.u.tex.last_level = last_level;
+
+   psv = pipe->create_sampler_view(pipe, pt, &templ);
+   if (psv == NULL)
+      return FALSE;
+
+   sv = svga_pipe_sampler_view(psv);
+   svga_validate_pipe_sampler_view(svga, sv);
+
+   ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle);
+   if (ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle);
+   }
+   pipe_sampler_view_reference(&psv, NULL);
+
+   svga->hud.num_generate_mipmap++;
+
+   return TRUE;
+}
index 0326907..99ba33b 100644 (file)
@@ -217,7 +217,14 @@ svga_texture_from_handle(struct pipe_screen * screen,
                        const struct pipe_resource *template,
                        struct winsys_handle *whandle);
 
-
+boolean
+svga_texture_generate_mipmap(struct pipe_context *pipe,
+                             struct pipe_resource *pt,
+                             enum pipe_format format,
+                             unsigned base_level,
+                             unsigned last_level,
+                             unsigned first_layer,
+                             unsigned last_layer);
 
 
 #endif /* SVGA_TEXTURE_H */
index 4ca7fb7..15f2313 100644 (file)
@@ -35,6 +35,7 @@
 struct pipe_context;
 struct pipe_screen;
 struct svga_context;
+struct svga_pipe_sampler_view;
 struct svga_winsys_surface;
 struct svga_surface;
 enum SVGA3dSurfaceFormat;
@@ -102,4 +103,9 @@ boolean
 svga_check_sampler_view_resource_collision(struct svga_context *svga,
                                            struct svga_winsys_surface *res,
                                            unsigned shader);
+
+enum pipe_error
+svga_validate_pipe_sampler_view(struct svga_context *svga,
+                                struct svga_pipe_sampler_view *sv);
+
 #endif
index fca501b..b21634f 100644 (file)
@@ -319,6 +319,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PRIMITIVE_RESTART:
       return 1; /* may be a sw fallback, depending on restart index */
 
+   case PIPE_CAP_GENERATE_MIPMAP:
+      return sws->have_vgpu10;
+
    /* Unsupported features */
    case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -342,6 +345,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -349,6 +354,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
@@ -384,6 +392,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
       return 0;
    }
 
@@ -457,6 +468,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -515,6 +527,7 @@ vgpu9_get_shader_param(struct pipe_screen *screen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
          return 0;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
@@ -606,6 +619,7 @@ vgpu10_get_shader_param(struct pipe_screen *screen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
@@ -811,6 +825,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
       QUERY("num-surface-views", SVGA_QUERY_NUM_SURFACE_VIEWS,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-generate-mipmap", SVGA_QUERY_NUM_GENERATE_MIPMAP,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
    };
 #undef QUERY
 
index 722b369..4479a27 100644 (file)
@@ -129,7 +129,11 @@ update_state(struct svga_context *svga,
              const struct svga_tracked_state *atoms[],
              unsigned *state)
 {
+#ifdef DEBUG
    boolean debug = TRUE;
+#else
+   boolean debug = FALSE;
+#endif
    enum pipe_error ret = PIPE_OK;
    unsigned i;
 
index 2cf4113..8ab1693 100644 (file)
@@ -613,7 +613,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size, &offset,
+   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+                  CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {
       if (src_map)
index e392778..bac9166 100644 (file)
@@ -452,6 +452,7 @@ struct svga_tracked_state svga_hw_fs =
     SVGA_NEW_TEXTURE_BINDING |
     SVGA_NEW_NEED_SWTNL |
     SVGA_NEW_RAST |
+    SVGA_NEW_STIPPLE |
     SVGA_NEW_REDUCED_PRIMITIVE |
     SVGA_NEW_SAMPLER |
     SVGA_NEW_FRAME_BUFFER |
index b070f65..e7b540c 100644 (file)
@@ -90,7 +90,7 @@ svga_check_sampler_view_resource_collision(struct svga_context *svga,
  * Create a DX ShaderResourceSamplerView for the given pipe_sampler_view,
  * if needed.
  */
-static enum pipe_error
+enum pipe_error
 svga_validate_pipe_sampler_view(struct svga_context *svga,
                                 struct svga_pipe_sampler_view *sv)
 {
index 24574c1..a103dab 100644 (file)
@@ -173,8 +173,11 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
       return;
    }
 
+   /* SVGA_NEW_PRESCALE */
    key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
                            (svga->curr.gs == NULL);
+
+   /* SVGA_NEW_RAST */
    key->vs.allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
 
    /* SVGA_NEW_FS */
index 79dc0bf..4d21f4f 100644 (file)
@@ -220,8 +220,6 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
    struct draw_context *draw = svga->swtnl.draw;
    struct vertex_info *vinfo = &svga_render->vertex_info;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
-   const enum interp_mode colorInterp =
-      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
    struct svga_fragment_shader *fs = svga->curr.fs;
    int offset = 0;
    int nr_decls = 0;
@@ -236,7 +234,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
    /* always add position */
    src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
-   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, src);
    vinfo->attrib[0].emit = EMIT_4F;
    vdecl[0].array.offset = offset;
    vdecl[0].identity.method = SVGA3D_DECLMETHOD_DEFAULT;
@@ -257,14 +255,14 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
 
       switch (sem_name) {
       case TGSI_SEMANTIC_COLOR:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          offset += 16;
          nr_decls++;
          break;
       case TGSI_SEMANTIC_GENERIC:
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
          vdecl[nr_decls].identity.usageIndex =
@@ -273,7 +271,7 @@ svga_swtnl_update_vdecl( struct svga_context *svga )
          nr_decls++;
          break;
       case TGSI_SEMANTIC_FOG:
-         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, src);
          vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
          vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1;
          assert(vdecl[nr_decls].identity.usageIndex == 0);
index dbb90f7..489e68f 100644 (file)
@@ -166,7 +166,7 @@ scalar(struct src_register src, unsigned comp)
 static boolean
 svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -179,7 +179,7 @@ svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
 static int
 svga_arl_adjustment( const struct svga_shader_emitter *emit )
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
       if (emit->arl_consts[i].arl_num == emit->current_arl)
@@ -1175,7 +1175,7 @@ emit_div(struct svga_shader_emitter *emit,
    const struct src_register src1 =
       translate_src_register(emit, &insn->Src[1] );
    SVGA3dShaderDestToken temp = get_temp( emit );
-   int i;
+   unsigned i;
 
    /* For each enabled element, perform a RCP instruction.  Note that
     * RCP is scalar in SVGA3D:
@@ -1822,7 +1822,7 @@ emit_tex_swizzle(struct svga_shader_emitter *emit,
    const unsigned swizzleIn[4] = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
    unsigned srcSwizzle[4];
    unsigned srcWritemask = 0x0, zeroWritemask = 0x0, oneWritemask = 0x0;
-   int i;
+   unsigned i;
 
    /* build writemasks and srcSwizzle terms */
    for (i = 0; i < 4; i++) {
@@ -3371,7 +3371,7 @@ emit_light_twoside(struct svga_shader_emitter *emit)
    struct src_register back[2];
    SVGA3dShaderDestToken color[2];
    int count = emit->internal_color_count;
-   int i;
+   unsigned i;
    SVGA3dShaderInstToken if_token;
 
    if (count == 0)
@@ -3698,7 +3698,7 @@ static boolean
 pre_parse_add_indirect( struct svga_shader_emitter *emit,
                         int num, int current_arl)
 {
-   int i;
+   unsigned i;
    assert(num < 0);
 
    for (i = 0; i < emit->num_arl_consts; ++i) {
@@ -3844,7 +3844,8 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
    if (emit->unit == PIPE_SHADER_FRAGMENT && emit->key.fs.pstipple) {
       unsigned unit;
 
-      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+      new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                        TGSI_FILE_INPUT);
 
       if (new_tokens) {
          /* Setup texture state for stipple */
index c979f4a..1223e44 100644 (file)
@@ -2298,11 +2298,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
       return TRUE;
 
+#if 0
    case TGSI_FILE_RESOURCE:
       /*opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;*/
       /* XXX more, VGPU10_RETURN_TYPE_FLOAT */
       assert(!"TGSI_FILE_RESOURCE not handled yet");
       return FALSE;
+#endif
 
    case TGSI_FILE_ADDRESS:
       emit->num_address_regs = MAX2(emit->num_address_regs,
@@ -6170,6 +6172,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
 
       while (adjust_mask) {
          unsigned index = u_bit_scan(&adjust_mask);
+
+         /* skip the instruction if this vertex attribute is not being used */
+         if (emit->info.input_usage_mask[index] == 0)
+            continue;
+
          unsigned tmp = emit->vs.adjusted_input[index];
          struct tgsi_full_src_register input_src =
             make_src_reg(TGSI_FILE_INPUT, index);
@@ -6604,7 +6611,8 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
       tgsi_dump(tokens,0);
    }
 
-   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0);
+   new_tokens = util_pstipple_create_fragment_shader(tokens, &unit, 0,
+                                                     TGSI_FILE_INPUT);
 
    emit->fs.pstipple_sampler_unit = unit;
 
index 3129e46..562c669 100644 (file)
@@ -85,7 +85,7 @@ struct winsys_handle;
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
-#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0)
+#define SVGA_HINT_FLAG_CAN_PRE_FLUSH (1 << 0)  /* Can preemptively flush */
 
 /** Opaque surface handle */
 struct svga_winsys_surface;
index d4c88c9..b5ab924 100644 (file)
@@ -1291,6 +1291,42 @@ trace_context_flush(struct pipe_context *_pipe,
 }
 
 
+static inline boolean
+trace_context_generate_mipmap(struct pipe_context *_pipe,
+                              struct pipe_resource *res,
+                              enum pipe_format format,
+                              unsigned base_level,
+                              unsigned last_level,
+                              unsigned first_layer,
+                              unsigned last_layer)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   boolean ret;
+
+   res = trace_resource_unwrap(tr_ctx, res);
+
+   trace_dump_call_begin("pipe_context", "generate_mipmap");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, res);
+
+   trace_dump_arg(format, format);
+   trace_dump_arg(uint, base_level);
+   trace_dump_arg(uint, last_level);
+   trace_dump_arg(uint, first_layer);
+   trace_dump_arg(uint, last_layer);
+
+   ret = pipe->generate_mipmap(pipe, res, format, base_level, last_level,
+                               first_layer, last_layer);
+
+   trace_dump_ret(bool, ret);
+   trace_dump_call_end();
+
+   return ret;
+}
+
+
 static inline void
 trace_context_destroy(struct pipe_context *_pipe)
 {
@@ -1620,6 +1656,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(clear_render_target);
    TR_CTX_INIT(clear_depth_stencil);
    TR_CTX_INIT(flush);
+   TR_CTX_INIT(generate_mipmap);
    TR_CTX_INIT(texture_barrier);
    TR_CTX_INIT(memory_barrier);
    TR_CTX_INIT(set_tess_state);
index 24b577a..a9a2742 100644 (file)
@@ -32,6 +32,7 @@ C_SOURCES := \
        vc4_program.c \
        vc4_qir.c \
        vc4_qir_lower_uniforms.c \
+       vc4_qir_schedule.c \
        vc4_qir.h \
        vc4_qpu.c \
        vc4_qpu_defines.h \
index 16dcece..876c296 100644 (file)
@@ -54,8 +54,8 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
         bool old_msaa = vc4->msaa;
         int old_tile_width = vc4->tile_width;
         int old_tile_height = vc4->tile_height;
-        bool msaa = (info->src.resource->nr_samples ||
-                     info->dst.resource->nr_samples);
+        bool msaa = (info->src.resource->nr_samples > 1 ||
+                     info->dst.resource->nr_samples > 1);
         int tile_width = msaa ? 32 : 64;
         int tile_height = msaa ? 32 : 64;
 
@@ -110,9 +110,11 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
 
         pipe_surface_reference(&vc4->color_read, src_surf);
         pipe_surface_reference(&vc4->color_write,
-                               dst_surf->texture->nr_samples ? NULL : dst_surf);
+                               dst_surf->texture->nr_samples > 1 ?
+                               NULL : dst_surf);
         pipe_surface_reference(&vc4->msaa_color_write,
-                               dst_surf->texture->nr_samples ? dst_surf : NULL);
+                               dst_surf->texture->nr_samples > 1 ?
+                               dst_surf : NULL);
         pipe_surface_reference(&vc4->zs_read, NULL);
         pipe_surface_reference(&vc4->zs_write, NULL);
         pipe_surface_reference(&vc4->msaa_zs_write, NULL);
index 1cd1676..a0888f2 100644 (file)
@@ -67,15 +67,13 @@ vc4_flush(struct pipe_context *pctx)
         cl_u8(&bcl, VC4_PACKET_FLUSH);
         cl_end(&vc4->bcl, bcl);
 
-        vc4->msaa = false;
         if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
                 pipe_surface_reference(&vc4->color_write,
-                                       cbuf->texture->nr_samples ? NULL : cbuf);
+                                       cbuf->texture->nr_samples > 1 ?
+                                       NULL : cbuf);
                 pipe_surface_reference(&vc4->msaa_color_write,
-                                       cbuf->texture->nr_samples ? cbuf : NULL);
-
-                if (cbuf->texture->nr_samples)
-                        vc4->msaa = true;
+                                       cbuf->texture->nr_samples > 1 ?
+                                       cbuf : NULL);
 
                 if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
                         pipe_surface_reference(&vc4->color_read, cbuf);
@@ -92,15 +90,12 @@ vc4_flush(struct pipe_context *pctx)
         if (vc4->framebuffer.zsbuf &&
             (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                 pipe_surface_reference(&vc4->zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        NULL : zsbuf);
                 pipe_surface_reference(&vc4->msaa_zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        zsbuf : NULL);
 
-                if (zsbuf->texture->nr_samples)
-                        vc4->msaa = true;
-
                 if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                         pipe_surface_reference(&vc4->zs_read, zsbuf);
                 } else {
@@ -259,8 +254,9 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         if (!vc4->primconvert)
                 goto fail;
 
-        vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
-                                        PIPE_BIND_INDEX_BUFFER);
+        vc4->uploader = u_upload_create(pctx, 16 * 1024,
+                                        PIPE_BIND_INDEX_BUFFER,
+                                        PIPE_USAGE_STREAM);
 
         vc4_debug |= saved_shaderdb_flag;
 
index c008556..9b0b540 100644 (file)
@@ -319,7 +319,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                         if (vc4->indexbuf.user_buffer) {
                                 prsc = NULL;
                                 u_upload_data(vc4->uploader, 0,
-                                              info->count * index_size,
+                                              info->count * index_size, 4,
                                               vc4->indexbuf.user_buffer,
                                               &offset, &prsc);
                         } else {
index bf58c6c..110c783 100644 (file)
@@ -32,6 +32,7 @@
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
 
 struct drm_vc4_submit_rcl_surface {
        uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_mmap_bo {
        uint64_t offset;
 };
 
+struct drm_vc4_get_hang_state_bo {
+       uint32_t handle;
+       uint32_t paddr;
+       uint32_t size;
+       uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+*/
+struct drm_vc4_get_hang_state {
+       /** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+       uint64_t bo;
+       /**
+        * On input, the size of the bo array.  Output is the number
+        * of bos to be returned.
+        */
+       uint32_t bo_count;
+
+       uint32_t start_bin, start_render;
+
+       uint32_t ct0ca, ct0ea;
+       uint32_t ct1ca, ct1ea;
+       uint32_t ct0cs, ct1cs;
+       uint32_t ct0ra0, ct1ra0;
+
+       uint32_t bpca, bpcs;
+       uint32_t bpoa, bpos;
+
+       uint32_t vpmbase;
+
+       uint32_t dbge;
+       uint32_t fdbgo;
+       uint32_t fdbgb;
+       uint32_t fdbgr;
+       uint32_t fdbgs;
+       uint32_t errstat;
+
+       /* Pad that we may save more registers into in the future. */
+       uint32_t pad[16];
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
index 79bf2c4..5d071ec 100644 (file)
@@ -89,7 +89,7 @@ vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
         submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
         submit_surf->offset = surf->offset;
 
-        if (psurf->texture->nr_samples == 0) {
+        if (psurf->texture->nr_samples <= 1) {
                 if (is_depth) {
                         submit_surf->bits =
                                 VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
@@ -132,7 +132,7 @@ vc4_submit_setup_rcl_render_config_surface(struct vc4_context *vc4,
         submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
         submit_surf->offset = surf->offset;
 
-        if (psurf->texture->nr_samples == 0) {
+        if (psurf->texture->nr_samples <= 1) {
                 submit_surf->bits =
                         VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
                                       VC4_RENDER_CONFIG_FORMAT_BGR565 :
@@ -240,9 +240,11 @@ vc4_job_submit(struct vc4_context *vc4)
 #else
                 ret = vc4_simulator_flush(vc4, &submit);
 #endif
-                if (ret) {
-                        fprintf(stderr, "VC4 submit failed\n");
-                        abort();
+                static bool warned = false;
+                if (ret && !warned) {
+                        fprintf(stderr, "Draw call returned %s.  "
+                                        "Expect corruption.\n", strerror(errno));
+                        warned = true;
                 }
         }
 
index a1ec4c7..6d9a624 100644 (file)
@@ -712,12 +712,12 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
 void
 vc4_nir_lower_blend(struct vc4_compile *c)
 {
-        nir_foreach_overload(c->s, overload) {
-                if (overload->impl) {
-                        nir_foreach_block(overload->impl,
+        nir_foreach_function(c->s, function) {
+                if (function->impl) {
+                        nir_foreach_block(function->impl,
                                           vc4_nir_lower_blend_block, c);
 
-                        nir_metadata_preserve(overload->impl,
+                        nir_metadata_preserve(function->impl,
                                               nir_metadata_block_index |
                                               nir_metadata_dominance);
                 }
index 465b288..bf6631e 100644 (file)
@@ -467,8 +467,8 @@ vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl)
 void
 vc4_nir_lower_io(struct vc4_compile *c)
 {
-        nir_foreach_overload(c->s, overload) {
-                if (overload->impl)
-                        vc4_nir_lower_io_impl(c, overload->impl);
+        nir_foreach_function(c->s, function) {
+                if (function->impl)
+                        vc4_nir_lower_io_impl(c, function->impl);
         }
 }
index 54873e6..2490819 100644 (file)
@@ -165,8 +165,8 @@ vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl)
 void
 vc4_nir_lower_txf_ms(struct vc4_compile *c)
 {
-        nir_foreach_overload(c->s, overload) {
-                if (overload->impl)
-                        vc4_nir_lower_txf_ms_impl(c, overload->impl);
+        nir_foreach_function(c->s, function) {
+                if (function->impl)
+                        vc4_nir_lower_txf_ms_impl(c, function->impl);
         }
 }
index aea2b9d..b8ce377 100644 (file)
@@ -145,43 +145,6 @@ qir_opt_algebraic(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        if (is_zero(c, inst->src[1])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= (QOP_SEL_X_Y_ZS - QOP_SEL_X_0_ZS);
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        if (is_zero(c, inst->src[0])) {
-                                /* Replace references to a 0 uniform value
-                                 * with the SEL_X_0 equivalent, flipping the
-                                 * condition being evaluated since the operand
-                                 * order is flipped.
-                                 */
-                                dump_from(c, inst);
-                                inst->op -= QOP_SEL_X_Y_ZS;
-                                inst->op ^= 1;
-                                inst->op += QOP_SEL_X_0_ZS;
-                                inst->src[0] = inst->src[1];
-                                inst->src[1] = c->undef;
-                                progress = true;
-                                dump_to(c, inst);
-                                break;
-                        }
-
-                        break;
-
                 case QOP_FMIN:
                         if (is_1f(c, inst->src[1]) &&
                             inst->src[0].pack >= QPU_UNPACK_8D_REP &&
index caad05c..3e402d0 100644 (file)
@@ -89,7 +89,7 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                 range->dst_offset = c->next_ubo_dst_offset;
                 c->next_ubo_dst_offset += range->size;
                 c->num_ubo_ranges++;
-        };
+        }
 
         offset -= range->src_offset;
 
@@ -204,27 +204,6 @@ ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
         return r;
 };
 
-static struct qreg
-get_swizzled_channel(struct vc4_compile *c,
-                     struct qreg *srcs, int swiz)
-{
-        switch (swiz) {
-        default:
-        case UTIL_FORMAT_SWIZZLE_NONE:
-                fprintf(stderr, "warning: unknown swizzle\n");
-                /* FALLTHROUGH */
-        case UTIL_FORMAT_SWIZZLE_0:
-                return qir_uniform_f(c, 0.0);
-        case UTIL_FORMAT_SWIZZLE_1:
-                return qir_uniform_f(c, 1.0);
-        case UTIL_FORMAT_SWIZZLE_X:
-        case UTIL_FORMAT_SWIZZLE_Y:
-        case UTIL_FORMAT_SWIZZLE_Z:
-        case UTIL_FORMAT_SWIZZLE_W:
-                return srcs[swiz];
-        }
-}
-
 static inline struct qreg
 qir_SAT(struct vc4_compile *c, struct qreg val)
 {
@@ -275,7 +254,7 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
                                    qir_uniform_f(c, 2.4));
 
         qir_SF(c, qir_FSUB(c, srgb, qir_uniform_f(c, 0.04045)));
-        return qir_SEL_X_Y_NS(c, low, high);
+        return qir_SEL(c, QPU_COND_NS, low, high);
 }
 
 static struct qreg
@@ -338,30 +317,20 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg texture_output[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
                 for (int i = 0; i < 4; i++)
-                        texture_output[i] = scaled;
+                        dest[i] = scaled;
         } else {
-                struct qreg tex_result_unpacked[4];
                 for (int i = 0; i < 4; i++)
-                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-
-                const uint8_t *format_swiz =
-                        vc4_get_format_swizzle(c->key->tex[unit].format);
-                for (int i = 0; i < 4; i++) {
-                        texture_output[i] =
-                                get_swizzled_channel(c, tex_result_unpacked,
-                                                     format_swiz[i]);
-                }
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -470,12 +439,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 
         enum pipe_format format = c->key->tex[unit].format;
 
-        struct qreg unpacked[4];
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
                 struct qreg depth_output;
 
-                struct qreg one = qir_uniform_f(c, 1.0f);
+                struct qreg u0 = qir_uniform_f(c, 0.0f);
+                struct qreg u1 = qir_uniform_f(c, 1.0f);
                 if (c->key->tex[unit].compare_mode) {
                         if (has_proj)
                                 compare = qir_FMUL(c, compare, proj);
@@ -485,31 +455,31 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                 depth_output = qir_uniform_f(c, 0.0f);
                                 break;
                         case PIPE_FUNC_ALWAYS:
-                                depth_output = one;
+                                depth_output = u1;
                                 break;
                         case PIPE_FUNC_EQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
                                 break;
                         case PIPE_FUNC_NOTEQUAL:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_ZC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
                                 break;
                         case PIPE_FUNC_GREATER:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         case PIPE_FUNC_GEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LESS:
                                 qir_SF(c, qir_FSUB(c, compare, normalized));
-                                depth_output = qir_SEL_X_0_NS(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
                                 break;
                         case PIPE_FUNC_LEQUAL:
                                 qir_SF(c, qir_FSUB(c, normalized, compare));
-                                depth_output = qir_SEL_X_0_NC(c, one);
+                                depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
                                 break;
                         }
                 } else {
@@ -517,29 +487,15 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 }
 
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = depth_output;
+                        dest[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
-        }
-
-        const uint8_t *format_swiz = vc4_get_format_swizzle(format);
-        struct qreg texture_output[4];
-        for (int i = 0; i < 4; i++) {
-                texture_output[i] = get_swizzled_channel(c, unpacked,
-                                                         format_swiz[i]);
+                        dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
-        if (util_format_is_srgb(format)) {
-                for (int i = 0; i < 3; i++)
-                        texture_output[i] = qir_srgb_decode(c,
-                                                            texture_output[i]);
-        }
-
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
-                dest[i] = get_swizzled_channel(c, texture_output,
-                                               c->key->tex[unit].swizzle[i]);
+                if (c->tex_srgb_decode[unit] & (1 << i))
+                        dest[i] = qir_srgb_decode(c, dest[i]);
         }
 }
 
@@ -553,9 +509,8 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
         struct qreg diff = qir_FSUB(c, src, trunc);
         qir_SF(c, diff);
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
-                              diff);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
 }
 
 /**
@@ -572,9 +527,8 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, src, trunc));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 /**
@@ -591,9 +545,8 @@ ntq_fceil(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, trunc, src));
 
-        return qir_SEL_X_Y_NS(c,
-                              qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
-                              trunc);
+        return qir_SEL(c, QPU_COND_NS,
+                       qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
 }
 
 static struct qreg
@@ -668,10 +621,13 @@ ntq_fcos(struct vc4_compile *c, struct qreg src)
 static struct qreg
 ntq_fsign(struct vc4_compile *c, struct qreg src)
 {
+        struct qreg t = qir_get_temp(c);
+
         qir_SF(c, src);
-        return qir_SEL_X_Y_NC(c,
-                              qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0)),
-                              qir_uniform_f(c, -1.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
+        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
+        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
+        return t;
 }
 
 static void
@@ -888,6 +844,100 @@ ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
         return qir_UNPACK_8_I(c, base, offset_bit / 8);
 }
 
+/**
+ * If compare_instr is a valid comparison instruction, emits the
+ * compare_instr's comparison and returns the sel_instr's return value based
+ * on the compare_instr's result.
+ */
+static bool
+ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
+                    nir_alu_instr *compare_instr,
+                    nir_alu_instr *sel_instr)
+{
+        enum qpu_cond cond;
+
+        switch (compare_instr->op) {
+        case nir_op_feq:
+        case nir_op_ieq:
+        case nir_op_seq:
+                cond = QPU_COND_ZS;
+                break;
+        case nir_op_fne:
+        case nir_op_ine:
+        case nir_op_sne:
+                cond = QPU_COND_ZC;
+                break;
+        case nir_op_fge:
+        case nir_op_ige:
+        case nir_op_uge:
+        case nir_op_sge:
+                cond = QPU_COND_NC;
+                break;
+        case nir_op_flt:
+        case nir_op_ilt:
+        case nir_op_slt:
+                cond = QPU_COND_NS;
+                break;
+        default:
+                return false;
+        }
+
+        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
+        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
+
+        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
+                qir_SF(c, qir_FSUB(c, src0, src1));
+        else
+                qir_SF(c, qir_SUB(c, src0, src1));
+
+        switch (sel_instr->op) {
+        case nir_op_seq:
+        case nir_op_sne:
+        case nir_op_sge:
+        case nir_op_slt:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
+                break;
+
+        case nir_op_bcsel:
+                *dest = qir_SEL(c, cond,
+                                ntq_get_alu_src(c, sel_instr, 1),
+                                ntq_get_alu_src(c, sel_instr, 2));
+                break;
+
+        default:
+                *dest = qir_SEL(c, cond,
+                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
+                break;
+        }
+
+        return true;
+}
+
+/**
+ * Attempts to fold a comparison generating a boolean result into the
+ * condition code for selecting between two values, instead of comparing the
+ * boolean result against 0 to generate the condition code.
+ */
+static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
+                                  struct qreg *src)
+{
+        if (!instr->src[0].src.is_ssa)
+                goto out;
+        nir_alu_instr *compare =
+                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        if (!compare)
+                goto out;
+
+        struct qreg dest;
+        if (ntq_emit_comparison(c, &dest, compare, instr))
+                return dest;
+
+out:
+        qir_SF(c, src[0]);
+        return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -974,7 +1024,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
+                *dest = qir_SEL(c, QPU_COND_ZC,
+                                qir_uniform_ui(c, ~0),
+                                qir_uniform_ui(c, 0));
                 break;
 
         case nir_op_iadd:
@@ -1016,65 +1068,29 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_seq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_sge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_slt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_f(c, 1.0));
-                break;
         case nir_op_feq:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fne:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_fge:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_flt:
-                qir_SF(c, qir_FSUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ieq:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZS(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ine:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_ZC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ige:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_uge:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_CC(c, qir_uniform_ui(c, ~0));
-                break;
         case nir_op_ilt:
-                qir_SF(c, qir_SUB(c, src[0], src[1]));
-                *dest = qir_SEL_X_0_NS(c, qir_uniform_ui(c, ~0));
+                if (!ntq_emit_comparison(c, dest, instr, instr)) {
+                        fprintf(stderr, "Bad comparison instruction\n");
+                }
                 break;
 
         case nir_op_bcsel:
-                qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_NS(c, src[1], src[2]);
+                *dest = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL_X_Y_ZC(c, src[1], src[2]);
+                *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
                 break;
 
         case nir_op_frcp:
@@ -1705,10 +1721,10 @@ nir_to_qir(struct vc4_compile *c)
         ntq_setup_registers(c, &c->s->registers);
 
         /* Find the main function and emit the body. */
-        nir_foreach_overload(c->s, overload) {
-                assert(strcmp(overload->function->name, "main") == 0);
-                assert(overload->impl);
-                ntq_emit_impl(c, overload->impl);
+        nir_foreach_function(c->s, function) {
+                assert(strcmp(function->name, "main") == 0);
+                assert(function->impl);
+                ntq_emit_impl(c, function->impl);
         }
 }
 
@@ -1735,10 +1751,10 @@ static int
 count_nir_instrs(nir_shader *nir)
 {
         int count = 0;
-        nir_foreach_overload(nir, overload) {
-                if (!overload->impl)
+        nir_foreach_function(nir, function) {
+                if (!function->impl)
                         continue;
-                nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
+                nir_foreach_block(function->impl, count_nir_instrs_in_block, &count);
         }
         return count;
 }
@@ -1789,6 +1805,56 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         if (stage == QSTAGE_FRAG)
                 vc4_nir_lower_blend(c);
 
+        struct nir_lower_tex_options tex_options = {
+                /* We would need to implement txs, but we don't want the
+                 * int/float conversions
+                 */
+                .lower_rect = false,
+
+                /* We want to use this, but we don't want to newton-raphson
+                 * its rcp.
+                 */
+                .lower_txp = false,
+
+                /* Apply swizzles to all samplers. */
+                .swizzle_result = ~0,
+        };
+
+        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
+         * The format swizzling applies before sRGB decode, and
+         * ARB_texture_swizzle is the last thing before returning the sample.
+         */
+        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
+                enum pipe_format format = c->key->tex[i].format;
+
+                if (!format)
+                        continue;
+
+                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
+
+                for (int j = 0; j < 4; j++) {
+                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];
+
+                        if (arb_swiz <= 3) {
+                                tex_options.swizzles[i][j] =
+                                        format_swizzle[arb_swiz];
+                        } else {
+                                tex_options.swizzles[i][j] = arb_swiz;
+                        }
+
+                        /* If ARB_texture_swizzle is reading from the R, G, or
+                         * B channels of an sRGB texture, then we need to
+                         * apply sRGB decode to this channel at sample time.
+                         */
+                        if (arb_swiz < 3 && util_format_is_srgb(format)) {
+                                c->tex_srgb_decode[i] |= (1 << j);
+                        }
+
+                }
+        }
+
+        nir_lower_tex(c->s, &tex_options);
+
         if (c->fs_key && c->fs_key->light_twoside)
                 nir_lower_two_sided_color(c->s);
 
@@ -1848,12 +1914,15 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         qir_optimize(c);
         qir_lower_uniforms(c);
 
+        qir_schedule_instructions(c);
+
         if (vc4_debug & VC4_DEBUG_QIR) {
                 fprintf(stderr, "%s prog %d/%d QIR:\n",
                         qir_get_stage_name(c->stage),
                         c->program_id, c->variant_id);
                 qir_dump(c);
         }
+
         qir_reorder_uniforms(c);
         vc4_generate_code(vc4, c);
 
@@ -2043,7 +2112,7 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                 key->tex[i].swizzle[2] = sampler->swizzle_b;
                 key->tex[i].swizzle[3] = sampler->swizzle_a;
 
-                if (sampler->texture->nr_samples) {
+                if (sampler->texture->nr_samples > 1) {
                         key->tex[i].msaa_width = sampler->texture->width0;
                         key->tex[i].msaa_height = sampler->texture->height0;
                 } else if (sampler){
index c6916c4..efbb69b 100644 (file)
@@ -65,19 +65,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_XOR] = { "xor", 1, 2 },
         [QOP_NOT] = { "not", 1, 1 },
 
-        [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1, false, true },
-        [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1, false, true },
-        [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1, false, true },
-        [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1, false, true },
-        [QOP_SEL_X_0_CS] = { "fsel_x_0_cs", 1, 1, false, true },
-        [QOP_SEL_X_0_CC] = { "fsel_x_0_cc", 1, 1, false, true },
-        [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2, false, true },
-        [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2, false, true },
-        [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2, false, true },
-        [QOP_SEL_X_Y_CS] = { "fsel_x_y_cs", 1, 2, false, true },
-        [QOP_SEL_X_Y_CC] = { "fsel_x_y_cc", 1, 2, false, true },
-
         [QOP_RCP] = { "rcp", 1, 1, false, true },
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
@@ -219,23 +206,8 @@ qir_is_tex(struct qinst *inst)
 bool
 qir_depends_on_flags(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_SEL_X_0_NS:
-        case QOP_SEL_X_0_NC:
-        case QOP_SEL_X_0_ZS:
-        case QOP_SEL_X_0_ZC:
-        case QOP_SEL_X_0_CS:
-        case QOP_SEL_X_0_CC:
-        case QOP_SEL_X_Y_NS:
-        case QOP_SEL_X_Y_NC:
-        case QOP_SEL_X_Y_ZS:
-        case QOP_SEL_X_Y_ZC:
-        case QOP_SEL_X_Y_CS:
-        case QOP_SEL_X_Y_CC:
-                return true;
-        default:
-                return false;
-        }
+        return (inst->cond != QPU_COND_ALWAYS &&
+                inst->cond != QPU_COND_NEVER);
 }
 
 bool
@@ -292,8 +264,19 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 void
 qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
 {
-        fprintf(stderr, "%s%s ",
+        static const char *conditions[] = {
+                [QPU_COND_ALWAYS] = "",
+                [QPU_COND_NEVER] = ".never",
+                [QPU_COND_ZS] = ".zs",
+                [QPU_COND_ZC] = ".zc",
+                [QPU_COND_NS] = ".ns",
+                [QPU_COND_NC] = ".nc",
+                [QPU_COND_CS] = ".cs",
+                [QPU_COND_CC] = ".cc",
+        };
+        fprintf(stderr, "%s%s%s ",
                 qir_get_op_name(inst->op),
+                conditions[inst->cond],
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
@@ -352,6 +335,7 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
         inst->src = calloc(2, sizeof(inst->src[0]));
         inst->src[0] = src0;
         inst->src[1] = src1;
+        inst->cond = QPU_COND_ALWAYS;
 
         return inst;
 }
@@ -503,9 +487,9 @@ qir_SF(struct vc4_compile *c, struct qreg src)
         if (!list_empty(&c->instructions))
                 last_inst = (struct qinst *)c->instructions.prev;
 
-        if (!last_inst ||
-            last_inst->dst.file != src.file ||
-            last_inst->dst.index != src.index ||
+        if (src.file != QFILE_TEMP ||
+            !c->defs[src.index] ||
+            last_inst != c->defs[src.index] ||
             qir_is_multi_instruction(last_inst)) {
                 src = qir_MOV(c, src);
                 last_inst = (struct qinst *)c->instructions.prev;
index c34dce3..4ab4d35 100644 (file)
@@ -93,23 +93,6 @@ enum qop {
         QOP_XOR,
         QOP_NOT,
 
-        /* Note: Orderings of these compares must be the same as in
-         * qpu_defines.h.  Selects the src[0] if the ns flag bit is set,
-         * otherwise 0. */
-        QOP_SEL_X_0_ZS,
-        QOP_SEL_X_0_ZC,
-        QOP_SEL_X_0_NS,
-        QOP_SEL_X_0_NC,
-        QOP_SEL_X_0_CS,
-        QOP_SEL_X_0_CC,
-        /* Selects the src[0] if the ns flag bit is set, otherwise src[1]. */
-        QOP_SEL_X_Y_ZS,
-        QOP_SEL_X_Y_ZC,
-        QOP_SEL_X_Y_NS,
-        QOP_SEL_X_Y_NC,
-        QOP_SEL_X_Y_CS,
-        QOP_SEL_X_Y_CC,
-
         QOP_FTOI,
         QOP_ITOF,
         QOP_RCP,
@@ -170,6 +153,7 @@ struct qinst {
         struct qreg dst;
         struct qreg *src;
         bool sf;
+        uint8_t cond;
 };
 
 enum qstage {
@@ -385,6 +369,11 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /* Bitfield for whether a given channel of a sampler needs sRGB
+         * decode.
+         */
+        uint8_t tex_srgb_decode[VC4_MAX_TEXTURE_SAMPLERS];
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *
@@ -459,12 +448,15 @@ void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst);
 struct qreg qir_uniform(struct vc4_compile *c,
                         enum quniform_contents contents,
                         uint32_t data);
+void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
 
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
-static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+static inline struct qinst *
+qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
 {
         list_addtail(&inst->link, &c->instructions);
+        return inst;
 }
 
 struct qreg qir_get_temp(struct vc4_compile *c);
@@ -535,11 +527,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
 }                                                                        \
-static inline void                                                       \
+static inline struct qinst *                                             \
 qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
                   struct qreg a)                                         \
 {                                                                        \
-        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return qir_emit_nodef(c, qir_inst(QOP_##name, dest, a,           \
+                                          c->undef));                    \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -591,18 +584,6 @@ QIR_ALU2(V8MAX)
 QIR_ALU2(V8ADDS)
 QIR_ALU2(V8SUBS)
 QIR_ALU2(MUL24)
-QIR_ALU1(SEL_X_0_ZS)
-QIR_ALU1(SEL_X_0_ZC)
-QIR_ALU1(SEL_X_0_NS)
-QIR_ALU1(SEL_X_0_NC)
-QIR_ALU1(SEL_X_0_CS)
-QIR_ALU1(SEL_X_0_CC)
-QIR_ALU2(SEL_X_Y_ZS)
-QIR_ALU2(SEL_X_Y_ZC)
-QIR_ALU2(SEL_X_Y_NS)
-QIR_ALU2(SEL_X_Y_NC)
-QIR_ALU2(SEL_X_Y_CS)
-QIR_ALU2(SEL_X_Y_CC)
 QIR_ALU2(FMIN)
 QIR_ALU2(FMAX)
 QIR_ALU2(FMINABS)
@@ -647,6 +628,17 @@ QIR_NODST_1(TLB_STENCIL_SETUP)
 QIR_NODST_1(MS_MASK)
 
 static inline struct qreg
+qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
+{
+        struct qreg t = qir_get_temp(c);
+        struct qinst *a = qir_MOV_dest(c, t, src0);
+        struct qinst *b = qir_MOV_dest(c, t, src1);
+        a->cond = cond;
+        b->cond = cond ^ 1;
+        return t;
+}
+
+static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
         struct qreg t = qir_FMOV(c, src);
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
new file mode 100644 (file)
index 0000000..2f280c5
--- /dev/null
@@ -0,0 +1,618 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_schedule.c
+ *
+ * The basic model of the list scheduler is to take a basic block, compute a
+ * DAG of the dependencies from the bottom up, and make a list of the DAG
+ * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
+ * all the parents that are now DAG heads into the list of things to
+ * schedule.
+ *
+ * The goal of scheduling here, before register allocation and conversion to
+ * QPU instructions, is to reduce register pressure by reordering instructions
+ * to consume values when possible.
+ */
+
+#include "vc4_qir.h"
+
+static bool debug;
+
+/* Scheduler wrapper for one QIR instruction, forming a node of the
+ * dependency DAG.
+ */
+struct schedule_node {
+        struct list_head link;
+        struct qinst *inst;
+
+        /* DAG edges: since we schedule bottom-up, an instruction's inputs
+         * are its children.
+         */
+        struct schedule_node **children;
+        uint32_t child_count;
+        uint32_t child_array_size;
+        uint32_t parent_count;
+
+        /* Length of the longest (latency) chain from a DAG head to this
+         * instruction.
+         */
+        uint32_t delay;
+
+        /* Longest time + latency_between(parent, this) of any parent of this
+         * node.
+         */
+        uint32_t unblocked_time;
+};
+
+/* State for the main scheduling loop (schedule_instructions()). */
+struct schedule_state {
+        /* List of struct schedule_node *.  This starts out with all
+         * instructions, and after dependency updates it's trimmed to be just
+         * the DAG heads.
+         */
+        struct list_head worklist;
+
+        /* Estimated time counter, advanced as instructions are scheduled. */
+        uint32_t time;
+
+        /* Per-temp count of writes not yet scheduled. */
+        uint32_t *temp_writes;
+
+        /* Bitset of temps that have been read by an already-scheduled
+         * instruction and still have writes outstanding.
+         */
+        BITSET_WORD *temp_live;
+};
+
+/* When walking the instructions in reverse, we need to swap before/after in
+ * add_dep().
+ */
+enum direction { F, R };
+
+/**
+ * Marks a dependency between two instructions, that @after must appear after
+ * @before.
+ *
+ * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
+ * the latest instructions with nothing left to schedule are the DAG heads,
+ * and their inputs are their children.
+ */
+static void
+add_dep(enum direction dir,
+        struct schedule_node *before,
+        struct schedule_node *after)
+{
+        if (!before || !after)
+                return;
+
+        assert(before != after);
+
+        /* When walking the instruction list in reverse, edges are discovered
+         * in the opposite order, so swap the direction of the dependency.
+         */
+        if (dir == R) {
+                struct schedule_node *t = before;
+                before = after;
+                after = t;
+        }
+
+        /* Skip redundant edges.  (This previously compared against @after,
+         * which can never match given the assert above, so duplicate edges
+         * were silently accumulated and parent counts inflated.)
+         */
+        for (int i = 0; i < after->child_count; i++) {
+                if (after->children[i] == before)
+                        return;
+        }
+
+        if (after->child_array_size <= after->child_count) {
+                after->child_array_size = MAX2(after->child_array_size * 2, 16);
+                after->children = reralloc(after, after->children,
+                                           struct schedule_node *,
+                                           after->child_array_size);
+        }
+
+        after->children[after->child_count] = before;
+        after->child_count++;
+        before->parent_count++;
+}
+
+/* Adds a dependency on the previous writer of a resource, then records
+ * @after as the new last writer (ordering all accesses to that resource).
+ */
+static void
+add_write_dep(enum direction dir,
+              struct schedule_node **before,
+              struct schedule_node *after)
+{
+        add_dep(dir, *before, after);
+        *before = after;
+}
+
+/* Per-pass state for dependency construction (calculate_*_deps()). */
+struct schedule_setup_state {
+        /* Last-writer tracking used by add_write_dep(): per-temp for
+         * last_temp_write, single pointers for the fixed-function units.
+         */
+        struct schedule_node **last_temp_write;
+        struct schedule_node *last_sf;
+        struct schedule_node *last_vary_read;
+        struct schedule_node *last_vpm_read;
+        struct schedule_node *last_vpm_write;
+        struct schedule_node *last_tex_coord;
+        struct schedule_node *last_tex_result;
+        struct schedule_node *last_tlb;
+        enum direction dir;
+
+        /**
+         * Texture FIFO tracking.  This is done top-to-bottom, and is used to
+         * track the QOP_TEX_RESULTs and add dependencies on previous ones
+         * when trying to submit texture coords with TFREQ full or new texture
+         * fetches with TXRCV full.
+         */
+        struct {
+                struct schedule_node *node;
+                int coords;
+        } tex_fifo[8];
+        int tfreq_count; /**< Number of texture coords outstanding. */
+        int tfrcv_count; /**< Number of texture results outstanding. */
+        int tex_fifo_pos;
+};
+
+/* Makes @n depend on the oldest outstanding QOP_TEX_RESULT, then pops that
+ * entry off the texture FIFO tracking, releasing its TFREQ/TFRCV slots.
+ */
+static void
+block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
+{
+        add_dep(state->dir, state->tex_fifo[0].node, n);
+
+        state->tfreq_count -= state->tex_fifo[0].coords;
+        state->tfrcv_count--;
+
+        /* Shift the remaining FIFO entries down over the consumed one. */
+        memmove(&state->tex_fifo[0],
+                &state->tex_fifo[1],
+                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
+        state->tex_fifo_pos--;
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all VPM reads have to happen in order."
+ */
+static void
+calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
+{
+        struct qinst *inst = n->inst;
+        enum direction dir = state->dir;
+
+
+        /* Add deps for temp registers and varyings accesses.  Note that we
+         * ignore uniforms accesses, because qir_reorder_uniforms() happens
+         * after this.
+         */
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                switch (inst->src[i].file) {
+                case QFILE_TEMP:
+                        /* Read-after-write on the temp's last writer. */
+                        add_dep(dir,
+                                state->last_temp_write[inst->src[i].index], n);
+                        break;
+
+                case QFILE_VARY:
+                        add_write_dep(dir, &state->last_vary_read, n);
+                        break;
+
+                case QFILE_VPM:
+                        add_write_dep(dir, &state->last_vpm_read, n);
+                        break;
+
+                default:
+                        break;
+                }
+        }
+
+        switch (inst->op) {
+        case QOP_VARY_ADD_C:
+                add_dep(dir, state->last_vary_read, n);
+                break;
+
+        case QOP_TEX_S:
+        case QOP_TEX_T:
+        case QOP_TEX_R:
+        case QOP_TEX_B:
+        case QOP_TEX_DIRECT:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
+        case QOP_TEX_RESULT:
+                /* Results have to be fetched in order. */
+                add_write_dep(dir, &state->last_tex_result, n);
+                break;
+
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_READ:
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_STENCIL_SETUP:
+        case QOP_MS_MASK:
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        case QOP_TLB_DISCARD_SETUP:
+                add_write_dep(dir, &state->last_sf, n);
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        default:
+                break;
+        }
+
+        /* Write-after-write ordering on the destination. */
+        if (inst->dst.file == QFILE_VPM)
+                add_write_dep(dir, &state->last_vpm_write, n);
+        else if (inst->dst.file == QFILE_TEMP)
+                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+
+        /* Condition-code consumers must stay ordered against the last flags
+         * (sf) write.
+         */
+        if (qir_depends_on_flags(inst))
+                add_dep(dir, state->last_sf, n);
+
+        if (inst->sf)
+                add_write_dep(dir, &state->last_sf, n);
+}
+
+/* Walks the instruction list top-to-bottom building dependency edges,
+ * additionally modeling the texture coordinate (TFREQ, 8 entries) and
+ * texture result (TFRCV, 4 entries) FIFO limits.
+ */
+static void
+calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+        state.dir = F;
+
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+                struct qinst *inst = n->inst;
+
+                calculate_deps(&state, n);
+
+                switch (inst->op) {
+                case QOP_TEX_S:
+                case QOP_TEX_T:
+                case QOP_TEX_R:
+                case QOP_TEX_B:
+                case QOP_TEX_DIRECT:
+                        /* If the texture coordinate fifo is full,
+                         * block this on the last QOP_TEX_RESULT.
+                         */
+                        if (state.tfreq_count == 8) {
+                                block_until_tex_result(&state, n);
+                        }
+
+                        /* If the texture result fifo is full, block
+                         * adding any more to it until the last
+                         * QOP_TEX_RESULT.
+                         */
+                        if (inst->op == QOP_TEX_S ||
+                            inst->op == QOP_TEX_DIRECT) {
+                                if (state.tfrcv_count == 4)
+                                        block_until_tex_result(&state, n);
+                                state.tfrcv_count++;
+                        }
+
+                        state.tex_fifo[state.tex_fifo_pos].coords++;
+                        state.tfreq_count++;
+                        break;
+
+                case QOP_TEX_RESULT:
+                        /* Results have to be fetched after the
+                         * coordinate setup.  Note that we're assuming
+                         * here that our input shader has the texture
+                         * coord setup and result fetch in order,
+                         * which is true initially but not of our
+                         * instruction stream after this pass.
+                         */
+                        add_dep(state.dir, state.last_tex_coord, n);
+
+                        state.tex_fifo[state.tex_fifo_pos].node = n;
+
+                        state.tex_fifo_pos++;
+                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
+                               sizeof(state.tex_fifo[0]));
+                        break;
+                default:
+                        assert(!qir_is_tex(inst));
+                        break;
+                }
+        }
+}
+
+/* Walks the instruction list bottom-to-top, adding the dependencies that
+ * are only visible in that direction (calculate_deps() swaps edge direction
+ * via state.dir == R).
+ */
+static void
+calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.dir = R;
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+
+        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
+                calculate_deps(&state, n);
+        }
+}
+
+/* Estimates the register pressure change from scheduling @inst next
+ * (bottom-up); negative is better.  Scheduling the only remaining write of
+ * a temp frees it (-1), while each source temp not yet live becomes
+ * live (+1).
+ */
+static int
+get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
+{
+        int cost = 0;
+
+        if (inst->dst.file == QFILE_TEMP &&
+            state->temp_writes[inst->dst.index] == 1)
+                cost--;
+
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                if (inst->src[i].file == QFILE_TEMP &&
+                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
+                        cost++;
+                }
+        }
+
+        return cost;
+}
+
+/* Returns whether this instruction accesses the TLB, which locks the
+ * per-fragment scoreboard (see the heuristic in choose_instruction()).
+ */
+static bool
+locks_scoreboard(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_WRITE_MS:
+        case QOP_TLB_COLOR_READ:
+                return true;
+        default:
+                return false;
+        }
+}
+
+/* Heuristically picks the next DAG head to schedule from the worklist, in
+ * priority order: scoreboard-locking ops first (so they land late in the
+ * final program), then least stalling, then best register pressure change,
+ * then deepest latency chain.
+ */
+static struct schedule_node *
+choose_instruction(struct schedule_state *state)
+{
+        struct schedule_node *chosen = NULL;
+
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                if (!chosen) {
+                        chosen = n;
+                        continue;
+                }
+
+                /* Prefer scheduling things that lock the scoreboard, so that
+                 * they appear late in the program and we get more parallelism
+                 * between shaders on multiple QPUs hitting the same fragment.
+                 */
+                if (locks_scoreboard(n->inst) &&
+                    !locks_scoreboard(chosen->inst)) {
+                        chosen = n;
+                        continue;
+                } else if (!locks_scoreboard(n->inst) &&
+                           locks_scoreboard(chosen->inst)) {
+                        continue;
+                }
+
+                /* If we would block on the previously chosen node, but would
+                 * block less on this one, then prefer it.
+                 */
+                if (chosen->unblocked_time > state->time &&
+                    n->unblocked_time < chosen->unblocked_time) {
+                        chosen = n;
+                        continue;
+                } else if (n->unblocked_time > state->time &&
+                           n->unblocked_time > chosen->unblocked_time) {
+                        continue;
+                }
+
+                /* If we can definitely reduce register pressure, do so
+                 * immediately.
+                 */
+                int register_pressure_cost =
+                        get_register_pressure_cost(state, n->inst);
+                int chosen_register_pressure_cost =
+                        get_register_pressure_cost(state, chosen->inst);
+
+                if (register_pressure_cost < chosen_register_pressure_cost) {
+                        chosen = n;
+                        continue;
+                } else if (register_pressure_cost >
+                           chosen_register_pressure_cost) {
+                        continue;
+                }
+
+                /* Otherwise, prefer instructions with the deepest chain to
+                 * the end of the program.  This avoids the problem of
+                 * "everything generates a temp, nothing finishes freeing one,
+                 * guess I'll just keep emitting varying mul/adds".
+                 */
+                if (n->delay > chosen->delay) {
+                        chosen = n;
+                        continue;
+                } else if (n->delay < chosen->delay) {
+                        continue;
+                }
+        }
+
+        return chosen;
+}
+
+/* Debug dump of the worklist: each candidate with its pressure cost and its
+ * DAG children.
+ */
+static void
+dump_state(struct vc4_compile *c, struct schedule_state *state)
+{
+        uint32_t i = 0;
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                fprintf(stderr, "%3d: ", i++);
+                qir_dump_inst(c, n->inst);
+                fprintf(stderr, " (%d cost)\n",
+                        get_register_pressure_cost(state, n->inst));
+
+                /* NOTE(review): this inner `i` shadows the instruction
+                 * counter above (harmless here, but easy to trip on).
+                 */
+                for (int i = 0; i < n->child_count; i++) {
+                        struct schedule_node *child = n->children[i];
+                        fprintf(stderr, "   - ");
+                        qir_dump_inst(c, child->inst);
+                        fprintf(stderr, " (%d parents)\n", child->parent_count);
+                }
+        }
+}
+
+/* Estimate of how many instructions we should schedule between operations.
+ *
+ * These aren't in real cycle counts, because we're just estimating cycle
+ * times anyway.  QIR instructions will get paired up when turned into QPU
+ * instructions, or extra NOP delays will have to be added due to register
+ * allocation choices.
+ */
+static uint32_t
+latency_between(struct schedule_node *before, struct schedule_node *after)
+{
+        /* A texture fetch kicked off by TEX_S/TEX_DIRECT lands much later,
+         * so make its TEX_RESULT look very expensive to encourage other work
+         * to be scheduled in between.
+         */
+        if ((before->inst->op == QOP_TEX_S ||
+             before->inst->op == QOP_TEX_DIRECT) &&
+            after->inst->op == QOP_TEX_RESULT)
+                return 100;
+
+        return 1;
+}
+
+/** Recursive computation of the delay member of a node. */
+static void
+compute_delay(struct schedule_node *n)
+{
+        if (!n->child_count) {
+                /* The color read needs to be scheduled late, to avoid locking
+                 * the scoreboard early.  This is our best tool for
+                 * encouraging that.  The other scoreboard locking ops will
+                 * have this happen by default, since they are generally the
+                 * DAG heads or close to them.
+                 */
+                if (n->inst->op == QOP_TLB_COLOR_READ)
+                        n->delay = 1000;
+                else
+                        n->delay = 1;
+        } else {
+                /* delay == 0 doubles as the "not yet computed" marker for
+                 * the memoization here.
+                 */
+                for (int i = 0; i < n->child_count; i++) {
+                        if (!n->children[i]->delay)
+                                compute_delay(n->children[i]);
+                        n->delay = MAX2(n->delay,
+                                        n->children[i]->delay +
+                                        latency_between(n, n->children[i]));
+                }
+        }
+}
+
+/* Core list-scheduling loop: repeatedly picks a DAG head and prepends it to
+ * c->instructions.  Since we schedule bottom-up, prepending rebuilds the
+ * list in program order.
+ */
+static void
+schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
+{
+        if (debug) {
+                fprintf(stderr, "initial deps:\n");
+                dump_state(c, state);
+        }
+
+        /* Remove non-DAG heads from the list. */
+        list_for_each_entry_safe(struct schedule_node, n,
+                                 &state->worklist, link) {
+                if (n->parent_count != 0)
+                        list_del(&n->link);
+        }
+
+        state->time = 0;
+        while (!list_empty(&state->worklist)) {
+                /* A non-empty worklist always yields a candidate. */
+                struct schedule_node *chosen = choose_instruction(state);
+                struct qinst *inst = chosen->inst;
+
+                if (debug) {
+                        fprintf(stderr, "current list:\n");
+                        dump_state(c, state);
+                        fprintf(stderr, "chose: ");
+                        qir_dump_inst(c, inst);
+                        fprintf(stderr, " (%d cost)\n",
+                                get_register_pressure_cost(state, inst));
+                }
+
+                state->time = MAX2(state->time, chosen->unblocked_time);
+
+                /* Schedule this instruction back onto the QIR list. */
+                list_del(&chosen->link);
+                list_add(&inst->link, &c->instructions);
+
+                /* Now that we've scheduled a new instruction, some of its
+                 * children can be promoted to the list of instructions ready to
+                 * be scheduled.  Update the children's unblocked time for this
+                 * DAG edge as we do so.
+                 */
+                for (int i = chosen->child_count - 1; i >= 0; i--) {
+                        struct schedule_node *child = chosen->children[i];
+
+                        child->unblocked_time = MAX2(child->unblocked_time,
+                                                     state->time +
+                                                     latency_between(chosen,
+                                                                     child));
+                        child->parent_count--;
+                        if (child->parent_count == 0)
+                                list_add(&child->link, &state->worklist);
+                }
+
+                /* Update our tracking of register pressure. */
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_TEMP)
+                                BITSET_SET(state->temp_live, inst->src[i].index);
+                }
+                if (inst->dst.file == QFILE_TEMP) {
+                        state->temp_writes[inst->dst.index]--;
+                        if (state->temp_writes[inst->dst.index] == 0)
+                                BITSET_CLEAR(state->temp_live, inst->dst.index);
+                }
+
+                state->time++;
+        }
+}
+
+/* Pass entry point: reorders c->instructions with a bottom-up list scheduler
+ * to reduce register pressure (see the file comment).
+ */
+void
+qir_schedule_instructions(struct vc4_compile *c)
+{
+        void *mem_ctx = ralloc_context(NULL);
+        struct schedule_state state = { 0 };
+
+        if (debug) {
+                fprintf(stderr, "Pre-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
+        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
+                                        BITSET_WORDS(c->num_temps));
+        list_inithead(&state.worklist);
+
+        /* Wrap each instruction in a scheduler structure. */
+        list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) {
+                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+
+                n->inst = inst;
+                list_del(&inst->link);
+                list_addtail(&n->link, &state.worklist);
+
+                if (inst->dst.file == QFILE_TEMP)
+                        state.temp_writes[inst->dst.index]++;
+        }
+
+        /* Dependencies tracked top-to-bottom. */
+        calculate_forward_deps(c, mem_ctx, &state.worklist);
+        /* Dependencies tracked bottom-to-top. */
+        calculate_reverse_deps(c, mem_ctx, &state.worklist);
+
+        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
+                compute_delay(n);
+
+        schedule_instructions(c, &state);
+
+        if (debug) {
+                fprintf(stderr, "Post-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        /* All scheduler bookkeeping hangs off mem_ctx; free it in one go. */
+        ralloc_free(mem_ctx);
+}
index cb4e0cf..b06702a 100644 (file)
@@ -64,6 +64,12 @@ set_last_cond_add(struct vc4_compile *c, uint32_t cond)
         *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
 }
 
+/* Sets the condition field of the most recently queued instruction's mul op
+ * (mul-pipe counterpart of set_last_cond_add()).
+ */
+static void
+set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
+{
+        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
+}
+
 /**
  * Some special registers can be read from either file, which lets us resolve
  * raddr conflicts without extra MOVs.
@@ -306,42 +312,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
                 }
 
-                switch (qinst->op) {
-                case QOP_SEL_X_0_ZS:
-                case QOP_SEL_X_0_ZC:
-                case QOP_SEL_X_0_NS:
-                case QOP_SEL_X_0_NC:
-                case QOP_SEL_X_0_CS:
-                case QOP_SEL_X_0_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
-                                              1) + QPU_COND_ZS);
-                        break;
-
-                case QOP_SEL_X_Y_ZS:
-                case QOP_SEL_X_Y_ZC:
-                case QOP_SEL_X_Y_NS:
-                case QOP_SEL_X_Y_NC:
-                case QOP_SEL_X_Y_CS:
-                case QOP_SEL_X_Y_CC:
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        if (qinst->src[0].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
-                                          QPU_COND_ZS);
-
-                        queue(c, qpu_a_MOV(dst, src[1]));
-                        if (qinst->src[1].pack)
-                                *(last_inst(c)) |= unpack;
-                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
-                                              1) + QPU_COND_ZS);
-
-                        break;
+                bool handled_qinst_cond = true;
 
+                switch (qinst->op) {
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -497,16 +470,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_mul(c, qinst->cond);
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]) | unpack);
+                                set_last_cond_add(c, qinst->cond);
                         }
+                        handled_qinst_cond = true;
                         set_last_dst_pack(c, qinst);
 
                         break;
                 }
 
+                assert(qinst->cond == QPU_COND_ALWAYS ||
+                       handled_qinst_cond);
+
                 if (qinst->sf) {
                         assert(!qir_is_multi_instruction(qinst));
                         *last_inst(c) |= QPU_SF;
index 98b7b60..09164b7 100644 (file)
@@ -50,7 +50,7 @@ struct schedule_node {
         uint32_t child_array_size;
         uint32_t parent_count;
 
-        /* Longest cycles + n->latency of any parent of this node. */
+        /* Longest cycles + instruction_latency() of any parent of this node. */
         uint32_t unblocked_time;
 
         /**
@@ -259,7 +259,8 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                 }
         } else if (is_tmu_write(waddr)) {
                 add_write_dep(state, &state->last_tmu_write, n);
-        } else if (qpu_waddr_is_tlb(waddr)) {
+        } else if (qpu_waddr_is_tlb(waddr) ||
+                   waddr == QPU_W_MS_FLAGS) {
                 add_write_dep(state, &state->last_tlb, n);
         } else {
                 switch (waddr) {
@@ -623,6 +624,46 @@ dump_state(struct list_head *schedule_list)
         }
 }
 
+/* Estimated delay between a write to @waddr and the @after instruction
+ * being able to consume its result.  Physical regfile writes (waddr < 32)
+ * take 2; SFU results take 3; everything else 1.
+ */
+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Apply some huge latency between texture fetch requests and getting
+         * their results back.
+         */
+        if (waddr == QPU_W_TMU0_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+                        return 100;
+        }
+        if (waddr == QPU_W_TMU1_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+                        return 100;
+        }
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+/* Latency between two scheduled instructions: the worst case over @before's
+ * add-pipe and mul-pipe write addresses as seen by @after.
+ */
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        uint64_t before_inst = before->inst->inst;
+        uint64_t after_inst = after->inst->inst;
+
+        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+                                  after_inst),
+                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+                                  after_inst));
+}
+
 /** Recursive computation of the delay member of a node. */
 static void
 compute_delay(struct schedule_node *n)
@@ -634,7 +675,8 @@ compute_delay(struct schedule_node *n)
                         if (!n->children[i].node->delay)
                                 compute_delay(n->children[i].node);
                         n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + n->latency);
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
                 }
         }
 }
@@ -663,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                  * immediately after (or paired with!) the thing reading the
                  * destination.
                  */
-                int latency_from_previous = war_only ? 0 : node->latency;
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
                 child->unblocked_time = MAX2(child->unblocked_time,
-                                             time + latency_from_previous);
+                                             time + latency);
                 child->parent_count--;
                 if (child->parent_count == 0)
                         list_add(&child->link, schedule_list);
@@ -798,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
         return time;
 }
 
-static uint32_t waddr_latency(uint32_t waddr)
-{
-        if (waddr < 32)
-                return 2;
-
-        /* Some huge number, really. */
-        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 100;
-
-        switch(waddr) {
-        case QPU_W_SFU_RECIP:
-        case QPU_W_SFU_RECIPSQRT:
-        case QPU_W_SFU_EXP:
-        case QPU_W_SFU_LOG:
-                return 3;
-        default:
-                return 1;
-        }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
-        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
-                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
@@ -851,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
 
                 n->inst = inst;
-                n->latency = instruction_latency(inst->inst);
 
                 if (reads_uniform(inst->inst)) {
                         n->uniform = next_uniform++;
index e7069a4..036da32 100644 (file)
@@ -207,7 +207,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
         /* If the resource is multisampled, we need to resolve to single
          * sample.  This seems like it should be handled at a higher layer.
          */
-        if (prsc->nr_samples) {
+        if (prsc->nr_samples > 1) {
                 trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
                 if (!trans->ss_resource)
                         goto fail;
@@ -377,7 +377,7 @@ vc4_setup_slices(struct vc4_resource *rsc)
 
                 if (!rsc->tiled) {
                         slice->tiling = VC4_TILING_FORMAT_LINEAR;
-                        if (prsc->nr_samples) {
+                        if (prsc->nr_samples > 1) {
                                 /* MSAA (4x) surfaces are stored as raw tile buffer contents. */
                                 level_width = align(level_width, 32);
                                 level_height = align(level_height, 32);
@@ -458,7 +458,7 @@ vc4_resource_setup(struct pipe_screen *pscreen,
         prsc->screen = pscreen;
 
         rsc->base.vtbl = &vc4_resource_vtbl;
-        if (prsc->nr_samples == 0)
+        if (prsc->nr_samples <= 1)
                 rsc->cpp = util_format_get_blocksize(tmpl->format);
         else
                 rsc->cpp = sizeof(uint32_t);
@@ -475,7 +475,7 @@ get_resource_texture_format(struct pipe_resource *prsc)
         uint8_t format = vc4_get_tex_format(prsc->format);
 
         if (!rsc->tiled) {
-                if (prsc->nr_samples) {
+                if (prsc->nr_samples > 1) {
                         return ~0;
                 } else {
                         assert(format == VC4_TEXTURE_TYPE_RGBA8888);
@@ -497,7 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
          * communicate metadata about tiling currently.
          */
         if (tmpl->target == PIPE_BUFFER ||
-            tmpl->nr_samples ||
+            tmpl->nr_samples > 1 ||
             (tmpl->bind & (PIPE_BIND_SCANOUT |
                            PIPE_BIND_LINEAR |
                            PIPE_BIND_SHARED |
@@ -832,7 +832,7 @@ vc4_dump_surface(struct pipe_surface *psurf)
         if (!psurf)
                 return;
 
-        if (psurf->texture->nr_samples)
+        if (psurf->texture->nr_samples > 1)
                 vc4_dump_surface_msaa(psurf);
         else
                 vc4_dump_surface_non_msaa(psurf);
@@ -921,7 +921,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
 
         void *data;
         struct pipe_resource *shadow_rsc = NULL;
-        u_upload_alloc(vc4->uploader, 0, count * 2,
+        u_upload_alloc(vc4->uploader, 0, count * 2, 4,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 
index 090579c..fb41877 100644 (file)
@@ -57,6 +57,10 @@ static const struct debug_named_value debug_options[] = {
           "Flush after each draw call" },
         { "always_sync", VC4_DEBUG_ALWAYS_SYNC,
           "Wait for finish after each flush" },
+#if USE_VC4_SIMULATOR
+        { "dump", VC4_DEBUG_DUMP,
+          "Write a GPU command stream trace file" },
+#endif
         { NULL }
 };
 
@@ -96,6 +100,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TWO_SIDED_STENCIL:
         case PIPE_CAP_USER_INDEX_BUFFERS:
         case PIPE_CAP_TEXTURE_MULTISAMPLE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -124,7 +129,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
         case PIPE_CAP_CUBE_MAP_ARRAY:
         case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-        case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
         case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
         case PIPE_CAP_SEAMLESS_CUBE_MAP:
@@ -167,6 +171,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_TEXEL_OFFSET:
         case PIPE_CAP_MAX_VERTEX_STREAMS:
         case PIPE_CAP_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT:
+        case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
         case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
         case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
         case PIPE_CAP_SAMPLER_VIEW_TARGET:
@@ -176,15 +182,22 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-       case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-       case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-       case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-       case PIPE_CAP_DEPTH_BOUNDS_TEST:
-       case PIPE_CAP_TGSI_TXQS:
-       case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-       case PIPE_CAP_SHAREABLE_SHADERS:
-       case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-       case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
+        case PIPE_CAP_TGSI_TXQS:
+        case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+        case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
+        case PIPE_CAP_DRAW_PARAMETERS:
+        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+        case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+        case PIPE_CAP_INVALIDATE_BUFFER:
+        case PIPE_CAP_GENERATE_MIPMAP:
                 return 0;
 
                 /* Stream output. */
@@ -341,6 +354,8 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                 return PIPE_SHADER_IR_TGSI;
        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
                return 32;
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return 0;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
index 5992e37..03f76b2 100644 (file)
@@ -41,6 +41,7 @@ struct vc4_bo;
 #define VC4_DEBUG_ALWAYS_FLUSH 0x0080
 #define VC4_DEBUG_ALWAYS_SYNC  0x0100
 #define VC4_DEBUG_NIR       0x0200
+#define VC4_DEBUG_DUMP      0x0400
 
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
index 4b1df92..521ef50 100644 (file)
@@ -131,6 +131,93 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
         return 0;
 }
 
+static void
+vc4_dump_to_file(struct vc4_exec_info *exec)
+{
+        static int dumpno = 0;
+        struct drm_vc4_get_hang_state *state;
+        struct drm_vc4_get_hang_state_bo *bo_state;
+        unsigned int dump_version = 0;
+
+        if (!(vc4_debug & VC4_DEBUG_DUMP))
+                return;
+
+        state = calloc(1, sizeof(*state));
+
+        int unref_count = 0;
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                unref_count++;
+        }
+
+        /* Add one more for the overflow area that isn't wrapped in a BO. */
+        state->bo_count = exec->bo_count + unref_count + 1;
+        bo_state = calloc(state->bo_count, sizeof(*bo_state));
+
+        char *filename = NULL;
+        asprintf(&filename, "vc4-dri-%d.dump", dumpno++);
+        FILE *f = fopen(filename, "w+");
+        if (!f) {
+                fprintf(stderr, "Couldn't open %s: %s", filename,
+                        strerror(errno));
+                return;
+        }
+
+        fwrite(&dump_version, sizeof(dump_version), 1, f);
+
+        state->ct0ca = exec->ct0ca;
+        state->ct0ea = exec->ct0ea;
+        state->ct1ca = exec->ct1ca;
+        state->ct1ea = exec->ct1ea;
+        state->start_bin = exec->ct0ca;
+        state->start_render = exec->ct1ca;
+        fwrite(state, sizeof(*state), 1, f);
+
+        int i;
+        for (i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                bo_state[i].handle = i; /* Not used by the parser. */
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                bo_state[i].handle = 0;
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+                i++;
+        }
+
+        /* Add the static overflow memory area. */
+        bo_state[i].handle = exec->bo_count;
+        bo_state[i].paddr = 0;
+        bo_state[i].size = OVERFLOW_SIZE;
+        i++;
+
+        fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
+
+        for (int i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        void *overflow = calloc(1, OVERFLOW_SIZE);
+        fwrite(overflow, 1, OVERFLOW_SIZE, f);
+        free(overflow);
+
+        free(state);
+        free(bo_state);
+        fclose(f);
+}
+
 int
 vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
 {
@@ -183,6 +270,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
                             exec.ct1ea - exec.ct1ca, true);
         }
 
+        vc4_dump_to_file(&exec);
+
         if (exec.ct0ca != exec.ct0ea) {
                 int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
                 if (bfc != 1) {
index 40d3ada..e1f8b5a 100644 (file)
@@ -45,7 +45,7 @@ struct vc4_exec_info;
 #define roundup(x, y) align(x, y)
 #define round_up(x, y) align(x, y)
 #define max(x, y) MAX2(x, y)
-#define min(x, y) MiN2(x, y)
+#define min(x, y) MIN2(x, y)
 #define BUG_ON(condition) assert(!(condition))
 
 static inline int
index d9c0f55..7e0ada7 100644 (file)
@@ -462,9 +462,9 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
 
         vc4->msaa = false;
         if (cso->cbufs[0])
-                vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0;
+                vc4->msaa = cso->cbufs[0]->texture->nr_samples > 1;
         else if (cso->zsbuf)
-                vc4->msaa = cso->zsbuf->texture->nr_samples != 0;
+                vc4->msaa = cso->zsbuf->texture->nr_samples > 1;
 
         if (vc4->msaa) {
                 vc4->tile_width = 32;
index 527f763..c322503 100644 (file)
@@ -605,7 +605,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
            ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
 
            if (ib.user_buffer) {
-                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size,
+                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size, 256,
                                  ib.user_buffer, &ib.offset, &ib.buffer);
                    ib.user_buffer = NULL;
            }
@@ -948,8 +948,8 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
                     16, UTIL_SLAB_SINGLETHREADED);
 
    vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
-   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256,
-                                     PIPE_BIND_INDEX_BUFFER);
+   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024,
+                                     PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
    if (!vctx->uploader)
            goto fail;
 
index 26a4f77..fb2e567 100644 (file)
@@ -201,6 +201,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
@@ -219,6 +221,13 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_GENERATE_MIPMAP:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
index be7447d..4b551ed 100644 (file)
@@ -649,11 +649,15 @@ struct pipe_context {
                           struct pipe_resource *resource);
 
    /**
-    * Invalidate the contents of the resource.
+    * Invalidate the contents of the resource. This is used to
     *
-    * This is used to implement EGL's semantic of undefined depth/stencil
+    * (1) implement EGL's semantic of undefined depth/stencil
     * contenst after a swapbuffers.  This allows a tiled renderer (for
     * example) to not store the depth buffer.
+    *
+    * (2) implement GL's InvalidateBufferData. For backwards compatibility,
+    * you must only rely on the usability for this purpose when
+    * PIPE_CAP_INVALIDATE_BUFFER is enabled.
     */
    void (*invalidate_resource)(struct pipe_context *ctx,
                                struct pipe_resource *resource);
@@ -673,6 +677,18 @@ struct pipe_context {
     */
    void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream,
                             unsigned flags);
+
+   /**
+    * Generate mipmap.
+    * \return TRUE if mipmap generation succeeds, FALSE otherwise
+    */
+   boolean (*generate_mipmap)(struct pipe_context *ctx,
+                              struct pipe_resource *resource,
+                              enum pipe_format format,
+                              unsigned base_level,
+                              unsigned last_level,
+                              unsigned first_layer,
+                              unsigned last_layer);
 };
 
 
index cbf0ba6..cb837cd 100644 (file)
@@ -635,6 +635,15 @@ enum pipe_cap
    PIPE_CAP_SHAREABLE_SHADERS,
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
    PIPE_CAP_CLEAR_TEXTURE,
+   PIPE_CAP_DRAW_PARAMETERS,
+   PIPE_CAP_TGSI_PACK_HALF_FLOAT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT,
+   PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS,
+   PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL,
+   PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL,
+   PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
+   PIPE_CAP_INVALIDATE_BUFFER,
+   PIPE_CAP_GENERATE_MIPMAP,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -701,6 +710,7 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE,
    PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT,
+   PIPE_SHADER_CAP_MAX_SHADER_BUFFERS,
 };
 
 /**
index a3137ae..f300207 100644 (file)
@@ -76,8 +76,9 @@ enum tgsi_file_type {
    TGSI_FILE_IMMEDIATE           =7,
    TGSI_FILE_PREDICATE           =8,
    TGSI_FILE_SYSTEM_VALUE        =9,
-   TGSI_FILE_RESOURCE            =10,
+   TGSI_FILE_IMAGE               =10,
    TGSI_FILE_SAMPLER_VIEW        =11,
+   TGSI_FILE_BUFFER              =12,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -127,7 +128,8 @@ struct tgsi_declaration
    unsigned Invariant   : 1;  /**< invariant optimization? */
    unsigned Local       : 1;  /**< optimize as subroutine local variable? */
    unsigned Array       : 1;  /**< extra array info? */
-   unsigned Padding     : 6;
+   unsigned Atomic      : 1;  /**< atomic only? for TGSI_FILE_BUFFER */
+   unsigned Padding     : 5;
 };
 
 struct tgsi_declaration_range
@@ -186,7 +188,9 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
 #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
 #define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */
-#define TGSI_SEMANTIC_COUNT      36 /**< number of semantic values */
+#define TGSI_SEMANTIC_BASEINSTANCE 36
+#define TGSI_SEMANTIC_DRAWID     37
+#define TGSI_SEMANTIC_COUNT      38 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
@@ -195,11 +199,12 @@ struct tgsi_declaration_semantic
    unsigned Padding        : 8;
 };
 
-struct tgsi_declaration_resource {
+struct tgsi_declaration_image {
    unsigned Resource    : 8; /**< one of TGSI_TEXTURE_ */
    unsigned Raw         : 1;
    unsigned Writable    : 1;
-   unsigned Padding     : 22;
+   unsigned Format      : 10; /**< one of PIPE_FORMAT_ */
+   unsigned Padding     : 12;
 };
 
 enum tgsi_return_type {
@@ -406,6 +411,7 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_ENDSUB              102
 #define TGSI_OPCODE_TXQ_LZ              103 /* TXQ for mipmap level 0 */
 #define TGSI_OPCODE_TXQS                104
+#define TGSI_OPCODE_RESQ                105
                                 /* gap */
 #define TGSI_OPCODE_NOP                 107
 
@@ -567,7 +573,8 @@ struct tgsi_instruction
    unsigned Predicate  : 1;  /* BOOL */
    unsigned Label      : 1;
    unsigned Texture    : 1;
-   unsigned Padding    : 2;
+   unsigned Memory     : 1;
+   unsigned Padding    : 1;
 };
 
 /*
@@ -724,6 +731,19 @@ struct tgsi_dst_register
    unsigned Padding     : 6;
 };
 
+#define TGSI_MEMORY_COHERENT (1 << 0)
+#define TGSI_MEMORY_RESTRICT (1 << 1)
+#define TGSI_MEMORY_VOLATILE (1 << 2)
+
+/**
+ * Specifies the type of memory access to do for the LOAD/STORE instruction.
+ */
+struct tgsi_instruction_memory
+{
+   unsigned Qualifier : 3;  /* TGSI_MEMORY_ */
+   unsigned Padding   : 29;
+};
+
 
 #ifdef __cplusplus
 }
index 6bdf03a..2e4d283 100644 (file)
@@ -587,6 +587,8 @@ struct pipe_draw_info
    unsigned start_instance; /**< first instance id */
    unsigned instance_count; /**< number of instances */
 
+   unsigned drawid; /**< id of this draw in a multidraw */
+
    unsigned vertices_per_patch; /**< the number of vertices per patch */
 
    /**
@@ -618,7 +620,7 @@ struct pipe_draw_info
     */
    struct pipe_stream_output_target *count_from_stream_output;
 
-   /* Indirect parameters resource: If not NULL, most values are taken
+   /* Indirect draw parameters resource: If not NULL, most values are taken
     * from this buffer instead, which is laid out as follows:
     *
     * if indexed is TRUE:
@@ -639,6 +641,15 @@ struct pipe_draw_info
     */
    struct pipe_resource *indirect;
    unsigned indirect_offset; /**< must be 4 byte aligned */
+   unsigned indirect_stride; /**< must be 4 byte aligned */
+   unsigned indirect_count; /**< number of indirect draws */
+
+   /* Indirect draw count resource: If not NULL, contains a 32-bit value which
+    * is to be used as the real indirect_count. In that case indirect_count
+    * becomes the maximum possible value.
+    */
+   struct pipe_resource *indirect_params;
+   unsigned indirect_params_offset; /**< must be 4 byte aligned */
 };
 
 
index f14ffea..0be8365 100644 (file)
@@ -393,14 +393,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
 
     if (!This->driver_caps.user_vbufs)
-        This->vertex_uploader = u_upload_create(This->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+        This->vertex_uploader = u_upload_create(This->pipe, 65536,
+                                                PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_ibufs)
-        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024,
+                                               PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_cbufs) {
-        unsigned alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
+        This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
         This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
-                                                  alignment, PIPE_BIND_CONSTANT_BUFFER);
+                                                  PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
@@ -2955,6 +2956,7 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->vertex_uploader,
                       0,
                       (info.max_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       vtxbuf.user_buffer,
                       &vtxbuf.buffer_offset,
                       &vtxbuf.buffer);
@@ -3027,6 +3029,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
                       base,
                       (info.max_index -
                        info.min_index + 1) * VertexStreamZeroStride, /* XXX */
+                      4,
                       (const uint8_t *)vbuf.user_buffer + base,
                       &vbuf.buffer_offset,
                       &vbuf.buffer);
@@ -3039,6 +3042,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
         u_upload_data(This->index_uploader,
                       0,
                       info.count * ibuf.index_size,
+                      4,
                       ibuf.user_buffer,
                       &ibuf.offset,
                       &ibuf.buffer);
index 98d9c4d..cbc1e61 100644 (file)
@@ -128,6 +128,7 @@ struct NineDevice9
     struct u_upload_mgr *vertex_uploader;
     struct u_upload_mgr *index_uploader;
     struct u_upload_mgr *constbuf_uploader;
+    unsigned constbuf_alignment;
 
     struct nine_range_pool range_pool;
 
index fe8933b..0feaeab 100644 (file)
@@ -1391,7 +1391,15 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
     /* Fog.
      */
     if (key->fog_mode) {
-        struct ureg_src vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_INTERPOLATE_LINEAR);
+        struct ureg_src vPos;
+        if (device->screen->get_param(device->screen,
+                                      PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
+            vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                      TGSI_INTERPOLATE_LINEAR);
+        }
+
         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
         if (key->fog_mode == D3DFOG_EXP) {
             ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
@@ -1866,6 +1874,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
@@ -1888,6 +1897,7 @@ nine_ff_update(struct NineDevice9 *device)
             u_upload_data(device->constbuf_uploader,
                           0,
                           cb.buffer_size,
+                          device->constbuf_alignment,
                           cb.user_buffer,
                           &cb.buffer_offset,
                           &cb.buffer);
index 28f2787..ed43173 100644 (file)
@@ -462,6 +462,8 @@ struct shader_translator
     boolean lower_preds;
     boolean want_texcoord;
     boolean shift_wpos;
+    boolean wpos_is_sysval;
+    boolean face_is_sysval_integer;
     unsigned texcoord_sn;
 
     struct sm1_instruction insn; /* current instruction */
@@ -945,10 +947,16 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
     case D3DSPR_MISCTYPE:
         switch (param->idx) {
         case D3DSMO_POSITION:
-           if (ureg_src_is_undef(tx->regs.vPos))
-               tx->regs.vPos = ureg_DECL_fs_input(ureg,
-                                                  TGSI_SEMANTIC_POSITION, 0,
-                                                  TGSI_INTERPOLATE_LINEAR);
+           if (ureg_src_is_undef(tx->regs.vPos)) {
+              if (tx->wpos_is_sysval) {
+                  tx->regs.vPos =
+                      ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+              } else {
+                  tx->regs.vPos =
+                      ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
+              }
+           }
            if (tx->shift_wpos) {
                /* TODO: do this only once */
                struct ureg_dst wpos = tx_scratch(tx);
@@ -961,9 +969,20 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
            break;
         case D3DSMO_FACE:
            if (ureg_src_is_undef(tx->regs.vFace)) {
-               tx->regs.vFace = ureg_DECL_fs_input(ureg,
-                                                   TGSI_SEMANTIC_FACE, 0,
-                                                   TGSI_INTERPOLATE_CONSTANT);
+               if (tx->face_is_sysval_integer) {
+                   tmp = tx_scratch(tx);
+                   tx->regs.vFace =
+                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_FACE, 0);
+
+                   /* convert bool to float */
+                   ureg_UCMP(ureg, tmp, ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X),
+                             ureg_imm1f(ureg, 1), ureg_imm1f(ureg, -1));
+                   tx->regs.vFace = ureg_src(tmp);
+               } else {
+                   tx->regs.vFace = ureg_DECL_fs_input(ureg,
+                                                       TGSI_SEMANTIC_FACE, 0,
+                                                       TGSI_INTERPOLATE_CONSTANT);
+               }
                tx->regs.vFace = ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X);
            }
            src = tx->regs.vFace;
@@ -3259,10 +3278,15 @@ shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
         return;
     }
 
-    if (tx->info->fog_mode != D3DFOG_NONE)
-        depth = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
-                                              TGSI_INTERPOLATE_LINEAR),
-                                              TGSI_SWIZZLE_Z);
+    if (tx->info->fog_mode != D3DFOG_NONE) {
+        if (tx->wpos_is_sysval) {
+            depth = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
+        } else {
+            depth = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                       TGSI_INTERPOLATE_LINEAR);
+        }
+        depth = ureg_scalar(depth, TGSI_SWIZZLE_Z);
+    }
 
     nine_info_mark_const_f_used(tx->info, 33);
     fog_color = NINE_CONSTANT_SRC(32);
@@ -3344,6 +3368,8 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     tx->shift_wpos = !GET_CAP(TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
     tx->texcoord_sn = tx->want_texcoord ?
         TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
+    tx->wpos_is_sysval = GET_CAP(TGSI_FS_POSITION_IS_SYSVAL);
+    tx->face_is_sysval_integer = GET_CAP(TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 
     if (IS_VS) {
         tx->num_constf_allowed = NINE_MAX_CONST_F;
index 558d07a..aee3162 100644 (file)
@@ -260,6 +260,7 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
@@ -336,6 +337,7 @@ prepare_ps_constants_userbuf(struct NineDevice9 *device)
         u_upload_data(device->constbuf_uploader,
                       0,
                       cb.buffer_size,
+                      device->constbuf_alignment,
                       cb.user_buffer,
                       &cb.buffer_offset,
                       &cb.buffer);
index 3f5be26..3b1a7a4 100644 (file)
@@ -790,7 +790,7 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
         case D3DSWAPEFFECT_FLIP:
             UNTESTED(4);
         case D3DSWAPEFFECT_DISCARD:
-            /* rotate the queue */;
+            /* rotate the queue */
             pipe_resource_reference(&res, This->buffers[0]->base.resource);
             for (i = 1; i <= This->params.BackBufferCount; i++) {
                 NineSurface9_SetResourceResize(This->buffers[i - 1],
index da9ca10..afcbd97 100644 (file)
@@ -137,7 +137,7 @@ OMX_ERRORTYPE omx_workaround_Destructor(OMX_COMPONENTTYPE *comp)
    priv->state = OMX_StateInvalid;
    tsem_up(priv->messageSem);
 
-   /* wait for thread to exit */;
+   /* wait for thread to exit */
    pthread_join(priv->messageHandlerThread, NULL);
 
    return omx_base_component_Destructor(comp);
index aa45089..df22a97 100644 (file)
@@ -869,6 +869,9 @@ static void enc_ReleaseTasks(struct list_head *head)
 {
    struct encode_task *i, *next;
 
+   if (!head)
+          return;
+
    LIST_FOR_EACH_ENTRY_SAFE(i, next, head, list) {
       pipe_resource_reference(&i->bitstream, NULL);
       i->buf->destroy(i->buf);
index 0f27ba8..38d684b 100644 (file)
@@ -544,11 +544,39 @@ GLAPI OSMesaContext GLAPIENTRY
 OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
                        GLint accumBits, OSMesaContext sharelist)
 {
+   int attribs[100], n = 0;
+
+   attribs[n++] = OSMESA_FORMAT;
+   attribs[n++] = format;
+   attribs[n++] = OSMESA_DEPTH_BITS;
+   attribs[n++] = depthBits;
+   attribs[n++] = OSMESA_STENCIL_BITS;
+   attribs[n++] = stencilBits;
+   attribs[n++] = OSMESA_ACCUM_BITS;
+   attribs[n++] = accumBits;
+   attribs[n++] = 0;
+
+   return OSMesaCreateContextAttribs(attribs, sharelist);
+}
+
+
+/**
+ * New in Mesa 11.2
+ *
+ * Create context with attribute list.
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist)
+{
    OSMesaContext osmesa;
    struct st_context_iface *st_shared;
    enum st_context_error st_error = 0;
    struct st_context_attribs attribs;
    struct st_api *stapi = get_st_api();
+   GLenum format = GL_RGBA;
+   int depthBits = 0, stencilBits = 0, accumBits = 0;
+   int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0;
+   int i;
 
    if (sharelist) {
       st_shared = sharelist->stctx;
@@ -557,6 +585,64 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
       st_shared = NULL;
    }
 
+   for (i = 0; attribList[i]; i += 2) {
+      switch (attribList[i]) {
+      case OSMESA_FORMAT:
+         format = attribList[i+1];
+         switch (format) {
+         case OSMESA_COLOR_INDEX:
+         case OSMESA_RGBA:
+         case OSMESA_BGRA:
+         case OSMESA_ARGB:
+         case OSMESA_RGB:
+         case OSMESA_BGR:
+         case OSMESA_RGB_565:
+            /* legal */
+            break;
+         default:
+            return NULL;
+         }
+         break;
+      case OSMESA_DEPTH_BITS:
+         depthBits = attribList[i+1];
+         if (depthBits < 0)
+            return NULL;
+         break;
+      case OSMESA_STENCIL_BITS:
+         stencilBits = attribList[i+1];
+         if (stencilBits < 0)
+            return NULL;
+         break;
+      case OSMESA_ACCUM_BITS:
+         accumBits = attribList[i+1];
+         if (accumBits < 0)
+            return NULL;
+         break;
+      case OSMESA_PROFILE:
+         profile = attribList[i+1];
+         if (profile != OSMESA_CORE_PROFILE &&
+             profile != OSMESA_COMPAT_PROFILE)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MAJOR_VERSION:
+         version_major = attribList[i+1];
+         if (version_major < 1)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MINOR_VERSION:
+         version_minor = attribList[i+1];
+         if (version_minor < 0)
+            return NULL;
+         break;
+      case 0:
+         /* end of list */
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n");
+         return NULL;
+      }
+   }
+
    osmesa = (OSMesaContext) CALLOC_STRUCT(osmesa_context);
    if (!osmesa)
       return NULL;
@@ -581,9 +667,11 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
    /*
     * Create the rendering context
     */
-   attribs.profile = ST_PROFILE_DEFAULT;
-   attribs.major = 2;
-   attribs.minor = 1;
+   memset(&attribs, 0, sizeof(attribs));
+   attribs.profile = (profile == OSMESA_CORE_PROFILE)
+      ? ST_PROFILE_OPENGL_CORE : ST_PROFILE_DEFAULT;
+   attribs.major = version_major;
+   attribs.minor = version_minor;
    attribs.flags = 0;  /* ST_CONTEXT_FLAG_x */
    attribs.options.force_glsl_extensions_warn = FALSE;
    attribs.options.disable_blend_func_extended = FALSE;
@@ -614,6 +702,7 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
 }
 
 
+
 /**
  * Destroy an Off-Screen Mesa rendering context.
  *
@@ -883,6 +972,7 @@ struct name_function
 static struct name_function functions[] = {
    { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext },
    { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt },
+   { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs },
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
index 769305e..c2c24d6 100644 (file)
@@ -40,6 +40,7 @@ vlVaCreateBuffer(VADriverContextP ctx, VAContextID context, VABufferType type,
                  unsigned int size, unsigned int num_elements, void *data,
                  VABufferID *buf_id)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
@@ -62,7 +63,10 @@ vlVaCreateBuffer(VADriverContextP ctx, VAContextID context, VABufferType type,
    if (data)
       memcpy(buf->data, data, size * num_elements);
 
-   *buf_id = handle_table_add(VL_VA_DRIVER(ctx)->htab, buf);
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   *buf_id = handle_table_add(drv->htab, buf);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -71,12 +75,16 @@ VAStatus
 vlVaBufferSetNumElements(VADriverContextP ctx, VABufferID buf_id,
                          unsigned int num_elements)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   buf = handle_table_get(drv->htab, buf_id);
+   pipe_mutex_unlock(drv->mutex);
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
@@ -109,22 +117,24 @@ vlVaMapBuffer(VADriverContextP ctx, VABufferID buf_id, void **pbuff)
    if (!pbuff)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
 
+   pipe_mutex_lock(drv->mutex);
    buf = handle_table_get(drv->htab, buf_id);
-   if (!buf)
-      return VA_STATUS_ERROR_INVALID_BUFFER;
-
-   if (buf->export_refcount > 0)
+   if (!buf || buf->export_refcount > 0) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
 
    if (buf->derived_surface.resource) {
       *pbuff = pipe_buffer_map(drv->pipe, buf->derived_surface.resource,
                                PIPE_TRANSFER_WRITE,
                                &buf->derived_surface.transfer);
+      pipe_mutex_unlock(drv->mutex);
 
       if (!buf->derived_surface.transfer || !*pbuff)
          return VA_STATUS_ERROR_INVALID_BUFFER;
 
    } else {
+      pipe_mutex_unlock(drv->mutex);
       *pbuff = buf->data;
    }
 
@@ -144,20 +154,23 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
    if (!drv)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   pipe_mutex_lock(drv->mutex);
    buf = handle_table_get(drv->htab, buf_id);
-   if (!buf)
-      return VA_STATUS_ERROR_INVALID_BUFFER;
-
-   if (buf->export_refcount > 0)
+   if (!buf || buf->export_refcount > 0) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
 
    if (buf->derived_surface.resource) {
-      if (!buf->derived_surface.transfer)
+      if (!buf->derived_surface.transfer) {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_INVALID_BUFFER;
+      }
 
       pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
       buf->derived_surface.transfer = NULL;
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -165,18 +178,25 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
 VAStatus
 vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
-   if (!buf)
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   buf = handle_table_get(drv->htab, buf_id);
+   if (!buf) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
 
    if (buf->derived_surface.resource) {
-      if (buf->export_refcount > 0)
+      if (buf->export_refcount > 0) {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_INVALID_BUFFER;
+      }
 
       pipe_resource_reference(&buf->derived_surface.resource, NULL);
    }
@@ -184,6 +204,7 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
    FREE(buf->data);
    FREE(buf);
    handle_table_remove(VL_VA_DRIVER(ctx)->htab, buf_id);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -192,12 +213,16 @@ VAStatus
 vlVaBufferInfo(VADriverContextP ctx, VABufferID buf_id, VABufferType *type,
                unsigned int *size, unsigned int *num_elements)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   buf = handle_table_get(drv->htab, buf_id);
+   pipe_mutex_unlock(drv->mutex);
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
@@ -212,6 +237,7 @@ VAStatus
 vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
                         VABufferInfo *out_buf_info)
 {
+   vlVaDriver *drv;
    uint32_t i;
    uint32_t mem_type;
    vlVaBuffer *buf ;
@@ -226,7 +252,11 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   drv = VL_VA_DRIVER(ctx);
+   screen = VL_VA_PSCREEN(ctx);
+   pipe_mutex_lock(drv->mutex);
    buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   pipe_mutex_unlock(drv->mutex);
 
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
@@ -255,13 +285,6 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
    if (!buf->derived_surface.resource)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
-   screen = VL_VA_PSCREEN(ctx);
-
-   if (buf->derived_surface.fence) {
-      screen->fence_finish(screen, buf->derived_surface.fence, PIPE_TIMEOUT_INFINITE);
-      screen->fence_reference(screen, &buf->derived_surface.fence, NULL);
-   }
-
    if (buf->export_refcount > 0) {
       if (buf->export_state.mem_type != mem_type)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
@@ -272,6 +295,10 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
       case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: {
          struct winsys_handle whandle;
 
+         pipe_mutex_lock(drv->mutex);
+         drv->pipe->flush(drv->pipe, NULL, 0);
+         pipe_mutex_unlock(drv->mutex);
+
          memset(&whandle, 0, sizeof(whandle));
          whandle.type = DRM_API_HANDLE_TYPE_FD;
 
@@ -300,12 +327,16 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
 VAStatus
 vlVaReleaseBufferHandle(VADriverContextP ctx, VABufferID buf_id)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   buf = handle_table_get(drv->htab, buf_id);
+   pipe_mutex_unlock(drv->mutex);
 
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
index 192794f..37a0117 100644 (file)
@@ -155,6 +155,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
 
    vl_csc_get_matrix(VL_CSC_COLOR_STANDARD_BT_601, NULL, true, &drv->csc);
    vl_compositor_set_csc_matrix(&drv->cstate, (const vl_csc_matrix *)&drv->csc);
+   pipe_mutex_init(drv->mutex);
 
    ctx->pDriverData = (void *)drv;
    ctx->version_major = 0;
@@ -262,7 +263,9 @@ vlVaCreateContext(VADriverContextP ctx, VAConfigID config_id, int picture_width,
    }
 
    context->desc.base.profile = config_id;
+   pipe_mutex_lock(drv->mutex);
    *context_id = handle_table_add(drv->htab, context);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -277,6 +280,7 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
 
    if (context->decoder) {
@@ -294,6 +298,7 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
    }
    FREE(context);
    handle_table_remove(drv->htab, context_id);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -312,6 +317,7 @@ vlVaTerminate(VADriverContextP ctx)
    drv->pipe->destroy(drv->pipe);
    drv->vscreen->destroy(drv->vscreen);
    handle_table_destroy(drv->htab);
+   pipe_mutex_destroy(drv->mutex);
    FREE(drv);
 
    return VA_STATUS_SUCCESS;
index ae07da8..2c42a98 100644 (file)
@@ -34,6 +34,7 @@
 #include "util/u_video.h"
 
 #include "vl/vl_winsys.h"
+#include "vl/vl_video_buffer.h"
 
 #include "va_private.h"
 
@@ -61,15 +62,9 @@ vlVaVideoSurfaceSize(vlVaSurface *p_surf, int component,
    *width = p_surf->templat.width;
    *height = p_surf->templat.height;
 
-   if (component > 0) {
-      if (p_surf->templat.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
-         *width /= 2;
-         *height /= 2;
-      } else if (p_surf->templat.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422)
-         *width /= 2;
-   }
-   if (p_surf->templat.interlaced)
-      *height /= 2;
+   vl_video_buffer_adjust_size(width, height, component,
+                               p_surf->templat.chroma_format,
+                               p_surf->templat.interlaced);
 }
 
 VAStatus
@@ -119,7 +114,9 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig
    img = CALLOC(1, sizeof(VAImage));
    if (!img)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   pipe_mutex_lock(drv->mutex);
    img->image_id = handle_table_add(drv->htab, img);
+   pipe_mutex_unlock(drv->mutex);
 
    img->format = *format;
    img->width = width;
@@ -261,16 +258,17 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, VAImage *image)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
    }
 
+   pipe_mutex_lock(drv->mutex);
    img->image_id = handle_table_add(drv->htab, img);
 
    img_buf->type = VAImageBufferType;
-   img_buf->size = image->data_size;
+   img_buf->size = img->data_size;
    img_buf->num_elements = 1;
-   img_buf->derived_surface.fence = surf->fence;
 
    pipe_resource_reference(&img_buf->derived_surface.resource, surfaces[0]->texture);
 
    img->buf = handle_table_add(VL_VA_DRIVER(ctx)->htab, img_buf);
+   pipe_mutex_unlock(drv->mutex);
 
    *image = *img;
 
@@ -280,16 +278,22 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, VAImage *image)
 VAStatus
 vlVaDestroyImage(VADriverContextP ctx, VAImageID image)
 {
+   vlVaDriver *drv;
    VAImage  *vaimage;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   vaimage = handle_table_get(VL_VA_DRIVER(ctx)->htab, image);
-   if (!vaimage)
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   vaimage = handle_table_get(drv->htab, image);
+   if (!vaimage) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_IMAGE;
+   }
 
    handle_table_remove(VL_VA_DRIVER(ctx)->htab, image);
+   pipe_mutex_unlock(drv->mutex);
    FREE(vaimage);
    return vlVaDestroyBuffer(ctx, vaimage->buf);
 }
@@ -322,21 +326,30 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
 
    drv = VL_VA_DRIVER(ctx);
 
+   pipe_mutex_lock(drv->mutex);
    surf = handle_table_get(drv->htab, surface);
-   if (!surf || !surf->buffer)
+   if (!surf || !surf->buffer) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SURFACE;
+   }
 
    vaimage = handle_table_get(drv->htab, image);
-   if (!vaimage)
+   if (!vaimage) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_IMAGE;
+   }
 
    img_buf = handle_table_get(drv->htab, vaimage->buf);
-   if (!img_buf)
+   if (!img_buf) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
 
    format = VaFourccToPipeFormat(vaimage->format.fourcc);
-   if (format == PIPE_FORMAT_NONE)
+   if (format == PIPE_FORMAT_NONE) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_OPERATION_FAILED;
+   }
 
    if (format != surf->buffer->buffer_format) {
       /* support NV12 to YV12 and IYUV conversion now only */
@@ -345,13 +358,17 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
           (format == PIPE_FORMAT_IYUV &&
           surf->buffer->buffer_format == PIPE_FORMAT_NV12))
          convert = true;
-      else
+      else {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_OPERATION_FAILED;
+      }
    }
 
    views = surf->buffer->get_sampler_view_planes(surf->buffer);
-   if (!views)
+   if (!views) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_OPERATION_FAILED;
+   }
 
    for (i = 0; i < vaimage->num_planes; i++) {
       data[i] = img_buf->data + vaimage->offsets[i];
@@ -378,8 +395,10 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
          uint8_t *map;
          map = drv->pipe->transfer_map(drv->pipe, views[i]->texture, 0,
                   PIPE_TRANSFER_READ, &box, &transfer);
-         if (!map)
+         if (!map) {
+            pipe_mutex_unlock(drv->mutex);
             return VA_STATUS_ERROR_OPERATION_FAILED;
+         }
 
          if (i == 1 && convert) {
             u_copy_nv12_to_yv12(data, pitches, i, j,
@@ -394,6 +413,7 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
          pipe_transfer_unmap(drv->pipe, transfer);
       }
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -416,28 +436,38 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
 
    surf = handle_table_get(drv->htab, surface);
-   if (!surf || !surf->buffer)
+   if (!surf || !surf->buffer) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SURFACE;
+   }
 
    vaimage = handle_table_get(drv->htab, image);
-   if (!vaimage)
+   if (!vaimage) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_IMAGE;
+   }
 
    img_buf = handle_table_get(drv->htab, vaimage->buf);
-   if (!img_buf)
+   if (!img_buf) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
 
    if (img_buf->derived_surface.resource) {
       /* Attempting to transfer derived image to surface */
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_UNIMPLEMENTED;
    }
 
    format = VaFourccToPipeFormat(vaimage->format.fourcc);
 
-   if (format == PIPE_FORMAT_NONE)
+   if (format == PIPE_FORMAT_NONE) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_OPERATION_FAILED;
+   }
 
    if (format != surf->buffer->buffer_format) {
       struct pipe_video_buffer *tmp_buf;
@@ -448,6 +478,7 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
 
       if (!tmp_buf) {
          surf->templat.buffer_format = old_surf_format;
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_ALLOCATION_FAILED;
       }
 
@@ -456,8 +487,10 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
    }
 
    views = surf->buffer->get_sampler_view_planes(surf->buffer);
-   if (!views)
+   if (!views) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_OPERATION_FAILED;
+   }
 
    for (i = 0; i < vaimage->num_planes; i++) {
       data[i] = img_buf->data + vaimage->offsets[i];
@@ -486,6 +519,7 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
             pitches[i] * views[i]->texture->array_size, 0);
       }
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
index 7b30bf8..89ac024 100644 (file)
@@ -50,24 +50,29 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
    if (!drv)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
-   if (!context)
+   if (!context) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_CONTEXT;
+   }
 
    surf = handle_table_get(drv->htab, render_target);
+   pipe_mutex_unlock(drv->mutex);
    if (!surf || !surf->buffer)
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
    context->target = surf->buffer;
 
    if (!context->decoder) {
+
       /* VPP */
       if (context->templat.profile == PIPE_VIDEO_PROFILE_UNKNOWN &&
-         ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM  &&
-           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM  &&
-           context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM  &&
-           context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) ||
-           context->target->interlaced))
+          context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM &&
+          context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM &&
+          context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM &&
+          context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM &&
+          context->target->buffer_format != PIPE_FORMAT_NV12)
          return VA_STATUS_ERROR_UNIMPLEMENTED;
 
       return VA_STATUS_SUCCESS;
@@ -174,6 +179,14 @@ static void
 handleSliceParameterBuffer(vlVaContext *context, vlVaBuffer *buf)
 {
    switch (u_reduce_video_profile(context->templat.profile)) {
+   case PIPE_VIDEO_FORMAT_MPEG12:
+      vlVaHandleSliceParameterBufferMPEG12(context, buf);
+      break;
+
+   case PIPE_VIDEO_FORMAT_VC1:
+      vlVaHandleSliceParameterBufferVC1(context, buf);
+      break;
+
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
       vlVaHandleSliceParameterBufferH264(context, buf);
       break;
@@ -281,14 +294,19 @@ vlVaRenderPicture(VADriverContextP ctx, VAContextID context_id, VABufferID *buff
    if (!drv)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
-   if (!context)
+   if (!context) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_CONTEXT;
+   }
 
    for (i = 0; i < num_buffers; ++i) {
       vlVaBuffer *buf = handle_table_get(drv->htab, buffers[i]);
-      if (!buf)
+      if (!buf) {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_INVALID_BUFFER;
+      }
 
       switch (buf->type) {
       case VAPictureParameterBufferType:
@@ -314,6 +332,7 @@ vlVaRenderPicture(VADriverContextP ctx, VAContextID context_id, VABufferID *buff
          break;
       }
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return vaStatus;
 }
@@ -331,7 +350,9 @@ vlVaEndPicture(VADriverContextP ctx, VAContextID context_id)
    if (!drv)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
+   pipe_mutex_unlock(drv->mutex);
    if (!context)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
index acbfe5d..883a94a 100644 (file)
@@ -48,6 +48,7 @@ void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context,
    unsigned i;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count = 0;
    /*CurrPic*/
    context->desc.h264.field_order_cnt[0] = h264->CurrPic.TopFieldOrderCnt;
    context->desc.h264.field_order_cnt[1] = h264->CurrPic.BottomFieldOrderCnt;
@@ -162,6 +163,7 @@ void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf)
    VASliceParameterBufferH264 *h264 = buf->data;
 
    assert(buf->size >= sizeof(VASliceParameterBufferH264) && buf->num_elements == 1);
+   context->desc.h264.slice_count += buf->num_elements;
    context->desc.h264.num_ref_idx_l0_active_minus1 =
       h264->num_ref_idx_l0_active_minus1;
    context->desc.h264.num_ref_idx_l1_active_minus1 =
index dc66b0f..28743ee 100644 (file)
@@ -159,11 +159,6 @@ void vlVaHandlePictureParameterBufferHEVC(vlVaDriver *drv, vlVaContext *context,
    for (i = 0 ; i < 15 ; i++) {
       context->desc.h265.PicOrderCntVal[i] = hevc->ReferenceFrames[i].pic_order_cnt;
 
-      unsigned int index = hevc->ReferenceFrames[i].picture_id & 0x7F;
-
-      if (index == 0x7F)
-         continue;
-
       vlVaGetReferenceFrame(drv, hevc->ReferenceFrames[i].picture_id, &context->desc.h265.ref[i]);
 
       if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE) && (iBefore < 8)) {
index e587b1e..812e9e5 100644 (file)
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex
    VAPictureParameterBufferMPEG2 *mpeg2 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices = 0;
    /*horizontal_size;*/
    /*vertical_size;*/
    vlVaGetReferenceFrame(drv, mpeg2->forward_reference_picture, &context->desc.mpeg12.ref[0]);
@@ -78,3 +79,8 @@ void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
       context->desc.mpeg12.non_intra_matrix = NULL;
 }
 
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferMPEG2) && buf->num_elements == 1);
+   context->desc.mpeg12.num_slices += buf->num_elements;
+}
index f95fd83..6ad1571 100644 (file)
@@ -32,6 +32,7 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    VAPictureParameterBufferVC1 * vc1 = buf->data;
 
    assert(buf->size >= sizeof(VAPictureParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count = 0;
    vlVaGetReferenceFrame(drv, vc1->forward_reference_picture, &context->desc.vc1.ref[0]);
    vlVaGetReferenceFrame(drv, vc1->backward_reference_picture, &context->desc.vc1.ref[1]);
    context->desc.vc1.picture_type = vc1->picture_fields.bits.picture_type;
@@ -65,3 +66,9 @@ void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context,
    context->desc.vc1.deblockEnable = vc1->post_processing != 0;
    context->desc.vc1.pquant = vc1->pic_quantizer_fields.bits.pic_quantizer_scale;
 }
+
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf)
+{
+   assert(buf->size >= sizeof(VASliceParameterBufferVC1) && buf->num_elements == 1);
+   context->desc.vc1.slice_count += buf->num_elements;
+}
index 105f251..0cec0c8 100644 (file)
@@ -27,6 +27,9 @@
 
 #include "util/u_handle_table.h"
 
+#include "vl/vl_defines.h"
+#include "vl/vl_video_buffer.h"
+
 #include "va_private.h"
 
 static const VARectangle *
@@ -44,46 +47,22 @@ vlVaRegionDefault(const VARectangle *region, struct pipe_video_buffer *buf,
    return def;
 }
 
-VAStatus
-vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
+static VAStatus
+vlVaPostProcCompositor(vlVaDriver *drv, vlVaContext *context,
+                       const VARectangle *src_region,
+                       const VARectangle *dst_region,
+                       struct pipe_video_buffer *src,
+                       struct pipe_video_buffer *dst,
+                       enum vl_compositor_deinterlace deinterlace)
 {
-   VARectangle def_src_region, def_dst_region;
-   const VARectangle *src_region, *dst_region;
+   struct pipe_surface **surfaces;
    struct u_rect src_rect;
    struct u_rect dst_rect;
-   vlVaSurface *src_surface;
-   VAProcPipelineParameterBuffer *pipeline_param;
-   struct pipe_surface **surfaces;
-   struct pipe_screen *screen;
-   struct pipe_surface *psurf;
-
-   if (!drv || !context)
-      return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-   if (!buf || !buf->data)
-      return VA_STATUS_ERROR_INVALID_BUFFER;
-
-   if (!context->target)
-      return VA_STATUS_ERROR_INVALID_SURFACE;
-
-   pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
-
-   src_surface = handle_table_get(drv->htab, pipeline_param->surface);
-   if (!src_surface || !src_surface->buffer)
-      return VA_STATUS_ERROR_INVALID_SURFACE;
-
-   surfaces = context->target->get_surfaces(context->target);
 
+   surfaces = dst->get_surfaces(dst);
    if (!surfaces || !surfaces[0])
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
-   screen = drv->pipe->screen;
-
-   psurf = surfaces[0];
-
-   src_region = vlVaRegionDefault(pipeline_param->surface_region, src_surface->buffer, &def_src_region);
-   dst_region = vlVaRegionDefault(pipeline_param->output_region, context->target, &def_dst_region);
-
    src_rect.x0 = src_region->x;
    src_rect.y0 = src_region->y;
    src_rect.x1 = src_region->x + src_region->width;
@@ -95,14 +74,175 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
    dst_rect.y1 = dst_region->y + dst_region->height;
 
    vl_compositor_clear_layers(&drv->cstate);
-   vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
+   vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src,
+                                 &src_rect, NULL, deinterlace);
    vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
-   vl_compositor_render(&drv->cstate, &drv->compositor, psurf, NULL, false);
+   vl_compositor_render(&drv->cstate, &drv->compositor, surfaces[0], NULL, false);
 
-   screen->fence_reference(screen, &src_surface->fence, NULL);
-   drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+   return VA_STATUS_SUCCESS;
+}
+
+static void vlVaGetBox(struct pipe_video_buffer *buf, unsigned idx,
+                       struct pipe_box *box, const VARectangle *region)
+{
+   unsigned plane = buf->interlaced ? idx / 2: idx;
+   unsigned x, y, width, height;
+
+   x = abs(region->x);
+   y = abs(region->y);
+   width = region->width;
+   height = region->height;
+
+   vl_video_buffer_adjust_size(&x, &y, plane, buf->chroma_format,
+                               buf->interlaced);
+   vl_video_buffer_adjust_size(&width, &height, plane, buf->chroma_format,
+                               buf->interlaced);
+
+   box->x = region->x < 0 ? -x : x;
+   box->y = region->y < 0 ? -y : y;
+   box->width = width;
+   box->height = height;
+}
+
+static VAStatus vlVaPostProcBlit(vlVaDriver *drv, vlVaContext *context,
+                                 const VARectangle *src_region,
+                                 const VARectangle *dst_region,
+                                 struct pipe_video_buffer *src,
+                                 struct pipe_video_buffer *dst,
+                                 enum vl_compositor_deinterlace deinterlace)
+{
+   struct pipe_surface **src_surfaces;
+   struct pipe_surface **dst_surfaces;
+   unsigned i;
+
+   if (src->interlaced != dst->interlaced)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   src_surfaces = src->get_surfaces(src);
+   if (!src_surfaces || !src_surfaces[0])
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   dst_surfaces = dst->get_surfaces(dst);
+   if (!dst_surfaces || !dst_surfaces[0])
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   for (i = 0; i < VL_MAX_SURFACES; ++i) {
+      struct pipe_surface *from = src_surfaces[i];
+      struct pipe_blit_info blit;
+
+      if (src->interlaced) {
+         /* Not 100% accurate, but close enough */
+         switch (deinterlace) {
+         case VL_COMPOSITOR_BOB_TOP:
+            from = src_surfaces[i & ~1];
+            break;
+         case VL_COMPOSITOR_BOB_BOTTOM:
+            from = src_surfaces[(i & ~1) + 1];
+            break;
+         default:
+            break;
+         }
+      }
+
+      if (!from || !dst_surfaces[i])
+         continue;
+
+      memset(&blit, 0, sizeof(blit));
+      blit.src.resource = from->texture;
+      blit.src.format = from->format;
+      blit.src.level = 0;
+      blit.src.box.z = from->u.tex.first_layer;
+      blit.src.box.depth = 1;
+      vlVaGetBox(src, i, &blit.src.box, src_region);
+
+      blit.dst.resource = dst_surfaces[i]->texture;
+      blit.dst.format = dst_surfaces[i]->format;
+      blit.dst.level = 0;
+      blit.dst.box.z = dst_surfaces[i]->u.tex.first_layer;
+      blit.dst.box.depth = 1;
+      vlVaGetBox(dst, i, &blit.dst.box, dst_region);
+
+      blit.mask = PIPE_MASK_RGBA;
+      blit.filter = PIPE_TEX_MIPFILTER_LINEAR;
+
+      drv->pipe->blit(drv->pipe, &blit);
+   }
+
+   // TODO: figure out why this is necessary for DMA-buf sharing
+   drv->pipe->flush(drv->pipe, NULL, 0);
 
    return VA_STATUS_SUCCESS;
 }
 
+VAStatus
+vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
+{
+   enum vl_compositor_deinterlace deinterlace = VL_COMPOSITOR_WEAVE;
+   VARectangle def_src_region, def_dst_region;
+   const VARectangle *src_region, *dst_region;
+   VAProcPipelineParameterBuffer *param;
+   vlVaSurface *src_surface;
+   unsigned i;
 
+   if (!drv || !context)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!buf || !buf->data)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (!context->target)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   param = buf->data;
+
+   src_surface = handle_table_get(drv->htab, param->surface);
+   if (!src_surface || !src_surface->buffer)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   for (i = 0; i < param->num_filters; i++) {
+      vlVaBuffer *buf = handle_table_get(drv->htab, param->filters[i]);
+      VAProcFilterParameterBufferBase *filter;
+
+      if (!buf || buf->type != VAProcFilterParameterBufferType)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
+
+      filter = buf->data;
+      switch (filter->type) {
+      case VAProcFilterDeinterlacing: {
+         VAProcFilterParameterBufferDeinterlacing *deint = buf->data;
+         switch (deint->algorithm) {
+         case VAProcDeinterlacingBob:
+            if (deint->flags & VA_DEINTERLACING_BOTTOM_FIELD)
+               deinterlace = VL_COMPOSITOR_BOB_BOTTOM;
+            else
+               deinterlace = VL_COMPOSITOR_BOB_TOP;
+            break;
+
+         case VAProcDeinterlacingWeave:
+            deinterlace = VL_COMPOSITOR_WEAVE;
+            break;
+
+         default:
+            return VA_STATUS_ERROR_UNIMPLEMENTED;
+         }
+
+         break;
+      }
+
+      default:
+         return VA_STATUS_ERROR_UNIMPLEMENTED;
+      }
+   }
+
+   src_region = vlVaRegionDefault(param->surface_region, src_surface->buffer, &def_src_region);
+   dst_region = vlVaRegionDefault(param->output_region, context->target, &def_dst_region);
+
+   if (context->target->buffer_format != PIPE_FORMAT_NV12)
+      return vlVaPostProcCompositor(drv, context, src_region, dst_region,
+                                    src_surface->buffer, context->target,
+                                    deinterlace);
+   else
+      return vlVaPostProcBlit(drv, context, src_region, dst_region,
+                              src_surface->buffer, context->target,
+                              deinterlace);
+}
index f146189..f546e56 100644 (file)
@@ -65,22 +65,30 @@ VAStatus
 vlVaCreateSubpicture(VADriverContextP ctx, VAImageID image,
                      VASubpictureID *subpicture)
 {
+   vlVaDriver *drv;
    vlVaSubpicture *sub;
    VAImage *img;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   img = handle_table_get(VL_VA_DRIVER(ctx)->htab, image);
-   if (!img)
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+   img = handle_table_get(drv->htab, image);
+   if (!img) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_IMAGE;
+   }
 
    sub = CALLOC(1, sizeof(*sub));
-   if (!sub)
+   if (!sub) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
 
    sub->image = img;
    *subpicture = handle_table_add(VL_VA_DRIVER(ctx)->htab, sub);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -88,17 +96,24 @@ vlVaCreateSubpicture(VADriverContextP ctx, VAImageID image,
 VAStatus
 vlVaDestroySubpicture(VADriverContextP ctx, VASubpictureID subpicture)
 {
+   vlVaDriver *drv;
    vlVaSubpicture *sub;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   sub = handle_table_get(VL_VA_DRIVER(ctx)->htab, subpicture);
-   if (!sub)
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+
+   sub = handle_table_get(drv->htab, subpicture);
+   if (!sub) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SUBPICTURE;
+   }
 
    FREE(sub);
-   handle_table_remove(VL_VA_DRIVER(ctx)->htab, subpicture);
+   handle_table_remove(drv->htab, subpicture);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -106,17 +121,24 @@ vlVaDestroySubpicture(VADriverContextP ctx, VASubpictureID subpicture)
 VAStatus
 vlVaSubpictureImage(VADriverContextP ctx, VASubpictureID subpicture, VAImageID image)
 {
+   vlVaDriver *drv;
    vlVaSubpicture *sub;
    VAImage *img;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   img = handle_table_get(VL_VA_DRIVER(ctx)->htab, image);
-   if (!img)
+   drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
+
+   img = handle_table_get(drv->htab, image);
+   if (!img) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_IMAGE;
+   }
 
-   sub = handle_table_get(VL_VA_DRIVER(ctx)->htab, subpicture);
+   sub = handle_table_get(drv->htab, subpicture);
+   pipe_mutex_unlock(drv->mutex);
    if (!sub)
       return VA_STATUS_ERROR_INVALID_SUBPICTURE;
 
@@ -164,15 +186,20 @@ vlVaAssociateSubpicture(VADriverContextP ctx, VASubpictureID subpicture,
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
 
    sub = handle_table_get(drv->htab, subpicture);
-   if (!sub)
+   if (!sub) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SUBPICTURE;
+   }
 
    for (i = 0; i < num_surfaces; i++) {
       surf = handle_table_get(drv->htab, target_surfaces[i]);
-      if (!surf)
+      if (!surf) {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_INVALID_SURFACE;
+      }
    }
 
    sub->src_rect = src_rect;
@@ -191,8 +218,10 @@ vlVaAssociateSubpicture(VADriverContextP ctx, VASubpictureID subpicture,
    tex_temp.flags = 0;
    if (!drv->pipe->screen->is_format_supported(
           drv->pipe->screen, tex_temp.format, tex_temp.target,
-          tex_temp.nr_samples, tex_temp.bind))
+          tex_temp.nr_samples, tex_temp.bind)) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
 
    tex = drv->pipe->screen->resource_create(drv->pipe->screen, &tex_temp);
 
@@ -200,13 +229,16 @@ vlVaAssociateSubpicture(VADriverContextP ctx, VASubpictureID subpicture,
    u_sampler_view_default_template(&sampler_templ, tex, tex->format);
    sub->sampler = drv->pipe->create_sampler_view(drv->pipe, tex, &sampler_templ);
    pipe_resource_reference(&tex, NULL);
-   if (!sub->sampler)
+   if (!sub->sampler) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
 
    for (i = 0; i < num_surfaces; i++) {
       surf = handle_table_get(drv->htab, target_surfaces[i]);
       util_dynarray_append(&surf->subpics, vlVaSubpicture *, sub);
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -224,15 +256,20 @@ vlVaDeassociateSubpicture(VADriverContextP ctx, VASubpictureID subpicture,
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
 
    sub = handle_table_get(drv->htab, subpicture);
-   if (!sub)
+   if (!sub) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SUBPICTURE;
+   }
 
    for (i = 0; i < num_surfaces; i++) {
       surf = handle_table_get(drv->htab, target_surfaces[i]);
-      if (!surf)
+      if (!surf) {
+         pipe_mutex_unlock(drv->mutex);
          return VA_STATUS_ERROR_INVALID_SURFACE;
+      }
 
       array = surf->subpics.data;
       if (!array)
@@ -246,6 +283,7 @@ vlVaDeassociateSubpicture(VADriverContextP ctx, VASubpictureID subpicture,
       while (surf->subpics.size && util_dynarray_top(&surf->subpics, vlVaSubpicture *) == NULL)
          (void)util_dynarray_pop(&surf->subpics, vlVaSubpicture *);
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
index 4a18a6f..f23a889 100644 (file)
@@ -68,16 +68,16 @@ vlVaDestroySurfaces(VADriverContextP ctx, VASurfaceID *surface_list, int num_sur
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
    for (i = 0; i < num_surfaces; ++i) {
       vlVaSurface *surf = handle_table_get(drv->htab, surface_list[i]);
       if (surf->buffer)
          surf->buffer->destroy(surf->buffer);
-      if(surf->fence)
-         drv->pipe->screen->fence_reference(drv->pipe->screen, &surf->fence, NULL);
       util_dynarray_fini(&surf->subpics);
       FREE(surf);
       handle_table_remove(drv->htab, surface_list[i]);
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -238,21 +238,21 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    drv = VL_VA_DRIVER(ctx);
+   pipe_mutex_lock(drv->mutex);
    surf = handle_table_get(drv->htab, surface_id);
-   if (!surf)
+   if (!surf) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_SURFACE;
+   }
 
    screen = drv->pipe->screen;
    vscreen = drv->vscreen;
 
-   if(surf->fence) {
-      screen->fence_finish(screen, surf->fence, PIPE_TIMEOUT_INFINITE);
-      screen->fence_reference(screen, &surf->fence, NULL);
-   }
-
    tex = vscreen->texture_from_drawable(vscreen, draw);
-   if (!tex)
+   if (!tex) {
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_DISPLAY;
+   }
 
    dirty_area = vscreen->get_dirty_area(vscreen);
 
@@ -261,6 +261,7 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
    surf_draw = drv->pipe->create_surface(drv->pipe, tex, &surf_templ);
    if (!surf_draw) {
       pipe_resource_reference(&tex, NULL);
+      pipe_mutex_unlock(drv->mutex);
       return VA_STATUS_ERROR_INVALID_DISPLAY;
    }
 
@@ -275,17 +276,19 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
    vl_compositor_render(&drv->cstate, &drv->compositor, surf_draw, dirty_area, true);
 
    status = vlVaPutSubpictures(surf, drv, surf_draw, dirty_area, &src_rect, &dst_rect);
-   if (status)
+   if (status) {
+      pipe_mutex_unlock(drv->mutex);
       return status;
+   }
 
    screen->flush_frontbuffer(screen, tex, 0, 0,
                              vscreen->get_private(vscreen), NULL);
 
-   screen->fence_reference(screen, &surf->fence, NULL);
-   drv->pipe->flush(drv->pipe, &surf->fence, 0);
+   drv->pipe->flush(drv->pipe, NULL, 0);
 
    pipe_resource_reference(&tex, NULL);
    pipe_surface_reference(&surf_draw, NULL);
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 }
@@ -607,6 +610,7 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
 
    memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
 
+   pipe_mutex_lock(drv->mutex);
    for (i = 0; i < num_surfaces; i++) {
       vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
       if (!surf)
@@ -635,10 +639,12 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
          assert(0);
       }
    }
+   pipe_mutex_unlock(drv->mutex);
 
    return VA_STATUS_SUCCESS;
 
 no_res:
+   pipe_mutex_unlock(drv->mutex);
    if (i)
       vlVaDestroySurfaces(ctx, surfaces, i);
 
@@ -657,7 +663,7 @@ vlVaQueryVideoProcFilters(VADriverContextP ctx, VAContextID context,
    if (!num_filters || !filters)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-   filters[num++] = VAProcFilterNone;
+   filters[num++] = VAProcFilterDeinterlacing;
 
    *num_filters = num;
 
@@ -682,8 +688,20 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context,
    switch (type) {
    case VAProcFilterNone:
       break;
+   case VAProcFilterDeinterlacing: {
+      VAProcFilterCapDeinterlacing *deint = filter_caps;
+
+      if (*num_filter_caps < 2) {
+         *num_filter_caps = 2;
+         return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
+      }
+
+      deint[i++].type = VAProcDeinterlacingBob;
+      deint[i++].type = VAProcDeinterlacingWeave;
+      break;
+   }
+
    case VAProcFilterNoiseReduction:
-   case VAProcFilterDeinterlacing:
    case VAProcFilterSharpening:
    case VAProcFilterColorBalance:
    case VAProcFilterSkinToneEnhancement:
index 6739efc..7afd81a 100644 (file)
@@ -44,6 +44,7 @@
 #include "vl/vl_csc.h"
 
 #include "util/u_dynarray.h"
+#include "os/os_thread.h"
 
 #define VL_VA_DRIVER(ctx) ((vlVaDriver *)ctx->pDriverData)
 #define VL_VA_PSCREEN(ctx) (VL_VA_DRIVER(ctx)->vscreen->pscreen)
@@ -203,6 +204,7 @@ typedef struct {
    struct vl_compositor compositor;
    struct vl_compositor_state cstate;
    vl_csc_matrix csc;
+   pipe_mutex mutex;
 } vlVaDriver;
 
 typedef struct {
@@ -244,7 +246,6 @@ typedef struct {
    struct {
       struct pipe_resource *resource;
       struct pipe_transfer *transfer;
-      struct pipe_fence_handle *fence;
    } derived_surface;
    unsigned int export_refcount;
    VABufferInfo export_state;
@@ -252,7 +253,6 @@ typedef struct {
 
 typedef struct {
    struct pipe_video_buffer templat, *buffer;
-   struct pipe_fence_handle *fence;
    struct util_dynarray subpics; /* vlVaSubpicture */
 } vlVaSurface;
 
@@ -353,10 +353,12 @@ VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContex
 void vlVaGetReferenceFrame(vlVaDriver *drv, VASurfaceID surface_id, struct pipe_video_buffer **ref_frame);
 void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferMPEG12(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferH264(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferH264(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferVC1(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
+void vlVaHandleSliceParameterBufferVC1(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandlePictureParameterBufferMPEG4(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleIQMatrixBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);
 void vlVaHandleSliceParameterBufferMPEG4(vlVaContext *context, vlVaBuffer *buf);
index c0b1ecc..dec79ff 100644 (file)
@@ -294,7 +294,7 @@ VdpStatus vlVdpVideoMixerRender(VdpVideoMixer mixer,
    default:
       pipe_mutex_unlock(vmixer->device->mutex);
       return VDP_STATUS_INVALID_VIDEO_MIXER_PICTURE_STRUCTURE;
-   };
+   }
 
    if (deinterlace != VL_COMPOSITOR_WEAVE && vmixer->deint.enabled &&
        video_surface_past_count > 1 && video_surface_future_count > 0) {
index 55d0d76..ffcedc1 100644 (file)
@@ -183,16 +183,9 @@ vlVdpVideoSurfaceSize(vlVdpSurface *p_surf, int component,
    *width = p_surf->templat.width;
    *height = p_surf->templat.height;
 
-   if (component > 0) {
-      if (p_surf->templat.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
-         *width /= 2;
-         *height /= 2;
-      } else if (p_surf->templat.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) {
-         *width /= 2;
-      }
-   }
-   if (p_surf->templat.interlaced)
-      *height /= 2;
+   vl_video_buffer_adjust_size(width, height, component,
+                               p_surf->templat.chroma_format,
+                               p_surf->templat.interlaced);
 }
 
 /**
index 08f95e8..f3ba1e3 100644 (file)
@@ -2,6 +2,9 @@ include $(top_srcdir)/src/gallium/Automake.inc
 
 lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la
 
+AM_CPPFLAGS = \
+        $(LIBELF_CFLAGS)
+
 lib@OPENCL_LIBNAME@_la_LDFLAGS = \
        $(LLVM_LDFLAGS) \
        -no-undefined \
@@ -19,7 +22,7 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
        $(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
        $(top_builddir)/src/gallium/auxiliary/libgallium.la \
        $(top_builddir)/src/util/libmesautil.la \
-       $(ELF_LIB) \
+       $(LIBELF_LIBS) \
        $(DLOPEN_LIBS) \
        -lclangCodeGen \
        -lclangFrontendTool \
index 733e7ac..1edd5c2 100644 (file)
@@ -42,6 +42,8 @@ TARGET_DRIVERS =
 TARGET_CPPFLAGS =
 TARGET_LIB_DEPS =
 
+include $(top_srcdir)/src/gallium/drivers/nouveau/Automake.inc
+
 include $(top_srcdir)/src/gallium/drivers/r600/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc
 
index bcdfb11..5ce12ab 100644 (file)
@@ -428,6 +428,35 @@ static void launch_grid(struct context *ctx, const uint *block_layout,
         pipe->launch_grid(pipe, block_layout, grid_layout, pc, input);
 }
 
+static void test_default_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef;
+}
+
+/* test_system_values */
+static void test_system_values_expect(void *p, int s, int x, int y)
+{
+        int id = x / 16, sv = (x % 16) / 4, c = x % 4;
+        int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
+        int bsz[] = { 4, 3, 5, 1};
+        int gsz[] = { 5, 4, 1, 1};
+
+        switch (sv) {
+        case 0:
+                *(uint32_t *)p = tid[c] / bsz[c];
+                break;
+        case 1:
+                *(uint32_t *)p = bsz[c];
+                break;
+        case 2:
+                *(uint32_t *)p = gsz[c];
+                break;
+        case 3:
+                *(uint32_t *)p = tid[c] % bsz[c];
+                break;
+        }
+}
+
 static void test_system_values(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -461,44 +490,31 @@ static void test_system_values(struct context *ctx)
                 "  STORE RES[0].xyzw, TEMP[0], SV[3]\n"
                 "  RET\n"
                 "ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                int id = x / 16, sv = (x % 16) / 4, c = x % 4;
-                int tid[] = { id % 20, (id % 240) / 20, id / 240, 0 };
-                int bsz[] = { 4, 3, 5, 1};
-                int gsz[] = { 5, 4, 1, 1};
-
-                switch (sv) {
-                case 0:
-                        *(uint32_t *)p = tid[c] / bsz[c];
-                        break;
-                case 1:
-                        *(uint32_t *)p = bsz[c];
-                        break;
-                case 2:
-                        *(uint32_t *)p = gsz[c];
-                        break;
-                case 3:
-                        *(uint32_t *)p = tid[c] % bsz[c];
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 76800, 0, init);
+                 76800, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){4, 3, 5}, (uint []){5, 4, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_system_values_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_access */
+static void test_resource_access_init0(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
+static void test_resource_access_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)((x + 4 * y) & 0x3f);
+}
+
 static void test_resource_access(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -519,31 +535,33 @@ static void test_resource_access(struct context *ctx)
                 "       STORE RES[1].xyzw, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init0(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)((x + 4*y) & 0x3f);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init0);
+                 256, 0, test_resource_access_init0);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 60, 12, init1);
+                 60, 12, test_default_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){15, 12, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_resource_access_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_function_calls */
+static void test_function_calls_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 15 * y + x;
+}
+
+static void test_function_calls_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
+}
+
 static void test_function_calls(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -585,26 +603,26 @@ static void test_function_calls(struct context *ctx)
                 "21:  STORE RES[0].x, TEMP[2], TEMP[1].xxxx\n"
                 "22:  RET\n"
                 "23: ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 15 * y + x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (15 * y + x) < 4 ? 2 : 1 ;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 15, 12, init);
+                 15, 12, test_function_calls_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){3, 3, 3}, (uint []){5, 4, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_function_calls_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_input_global */
+static void test_input_global_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
+}
+
 static void test_input_global(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -621,35 +639,39 @@ static void test_input_global(struct context *ctx)
                 "       STORE RGLOBAL.x, TEMP[1].yyyy, TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef - (x == 0 ? 0x10001 + 2 * s : 0);
-        }
         uint32_t input[8] = { 0x10001, 0x10002, 0x10003, 0x10004,
                               0x10005, 0x10006, 0x10007, 0x10008 };
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 32, src, NULL);
-        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
-        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0, init);
+        init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 2, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
+        init_tex(ctx, 3, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT, 32, 0,
+                 test_default_init);
         init_globals(ctx, (int []){ 0, 1, 2, 3, -1 },
                      (uint32_t *[]){ &input[1], &input[3],
                                      &input[5], &input[7] });
         launch_grid(ctx, (uint []){4, 1, 1}, (uint []){1, 1, 1}, 0, input);
-        check_tex(ctx, 0, expect, NULL);
-        check_tex(ctx, 1, expect, NULL);
-        check_tex(ctx, 2, expect, NULL);
-        check_tex(ctx, 3, expect, NULL);
+        check_tex(ctx, 0, test_input_global_expect, NULL);
+        check_tex(ctx, 1, test_input_global_expect, NULL);
+        check_tex(ctx, 2, test_input_global_expect, NULL);
+        check_tex(ctx, 3, test_input_global_expect, NULL);
         destroy_globals(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_private */
+static void test_private_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = (x / 32) + x % 32;
+}
+
 static void test_private(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -691,26 +713,26 @@ static void test_private(struct context *ctx)
                 "       ENDLOOP\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = (x / 32) + x % 32;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 128, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 32768, 0, init);
+                 32768, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){16, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_private_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_local */
+static void test_local_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 2 : 1;
+}
+
 static void test_local(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -778,26 +800,42 @@ static void test_local(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 2 : 1;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_local_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_sample */
+static void test_sample_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 1 : x * y;
+}
+
+static void test_sample_expect(void *p, int s, int x, int y)
+{
+        switch (x % 4) {
+        case 0:
+                *(float *)p = x / 4 * y;
+                break;
+        case 1:
+        case 2:
+                *(float *)p = 0;
+                break;
+        case 3:
+                *(float *)p = 1;
+                break;
+        }
+}
+
 static void test_sample(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -818,36 +856,19 @@ static void test_sample(struct context *ctx)
                 "       STORE RES[0].xyzw, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 1 : x * y;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x % 4) {
-                case 0:
-                        *(float *)p = x / 4 * y;
-                        break;
-                case 1:
-                case 2:
-                        *(float *)p = 0;
-                        break;
-                case 3:
-                        *(float *)p = 1;
-                        break;
-                }
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 128, 32, init);
+                 128, 32, test_sample_init);
         init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                 512, 32, init);
+                 512, 32, test_sample_init);
         init_compute_resources(ctx, (int []) { 1, -1 });
         init_sampler_views(ctx, (int []) { 0, -1 });
         init_sampler_states(ctx, 2);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_sample_expect, NULL);
         destroy_sampler_states(ctx);
         destroy_sampler_views(ctx);
         destroy_compute_resources(ctx);
@@ -855,6 +876,12 @@ static void test_sample(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_many_kern */
+static void test_many_kern_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x;
+}
+
 static void test_many_kern(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -883,29 +910,34 @@ static void test_many_kern(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], IMM[0].wwww\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 16, 0, init);
+                 16, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 5, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 10, NULL);
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){1, 1, 1}, 15, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_many_kern_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_constant */
+static void test_constant_init(void *p, int s, int x, int y)
+{
+        *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
+}
+
+static void test_constant_expect(void *p, int s, int x, int y)
+{
+        *(float *)p = 8.0 - (float)x;
+}
+
 static void test_constant(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -922,28 +954,36 @@ static void test_constant(struct context *ctx)
                 "       STORE RES[1].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(float *)p = s ? 0xdeadbeef : 8.0 - (float)x;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(float *)p = 8.0 - (float)x;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_tex(ctx, 1, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_constant_init);
         init_compute_resources(ctx, (int []) { 0, 1, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 1, expect, NULL);
+        check_tex(ctx, 1, test_constant_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_resource_indirect */
+static void test_resource_indirect_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = s == 0 ? 0xdeadbeef :
+                s == 1 ? x % 2 :
+                s == 2 ? 2 * x :
+                2 * x + 1;
+}
+
+static void test_resource_indirect_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
+}
+
 static void test_resource_indirect(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -961,35 +1001,27 @@ static void test_resource_indirect(struct context *ctx)
                 "       STORE RES[0].x, TEMP[0], TEMP[1]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = s == 0 ? 0xdeadbeef :
-                   s == 1 ? x % 2 :
-                   s == 2 ? 2 * x :
-                   2 * x + 1;
-        }
-        void expect(void *p, int s, int x, int y) {
-           *(uint32_t *)p = 2 * x + (x % 2 ? 1 : 0);
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 0, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 1, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 2, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_tex(ctx, 3, PIPE_BUFFER, false, PIPE_FORMAT_R32_FLOAT,
-                 256, 0, init);
+                 256, 0, test_resource_indirect_init);
         init_compute_resources(ctx, (int []) { 0, 1, 2, 3, -1 });
         launch_grid(ctx, (uint []){1, 1, 1}, (uint []){64, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_resource_indirect_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_surface_ld */
 enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_B8G8R8A8_UNORM,
         PIPE_FORMAT_B8G8R8X8_UNORM,
@@ -1023,6 +1055,42 @@ enum pipe_format surface_fmts[] = {
         PIPE_FORMAT_R32G32B32A32_SINT
 };
 
+static void test_surface_ld_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, .50, -.25 };
+        int i = 0;
+
+        util_format_write_4f(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        int i = 0;
+
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_ld_expectf(void *p, int s, int x, int y)
+{
+        float v[4], w[4];
+        int i = 0;
+
+        test_surface_ld_init0f(v, s, x / 4, y);
+        util_format_read_4f(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(float *)p = w[x % 4];
+}
+
+static void test_surface_ld_expecti(void *p, int s, int x, int y)
+{
+        int32_t v[4], w[4];
+        int i = 0;
+
+        test_surface_ld_init0i(v, s, x / 4, y);
+        util_format_read_4i(surface_fmts[i], w, 0, v, 0, 0, 0, 1, 1);
+        *(uint32_t *)p = w[x % 4];
+}
+
 static void test_surface_ld(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1040,33 +1108,6 @@ static void test_surface_ld(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, .50, -.25 };
-                util_format_write_4f(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void init1(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float v[4], w[4];
-                init0f(v, s, x / 4, y);
-                util_format_read_4f(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(float *)p = w[x % 4];
-        }
-        void expecti(void *p, int s, int x, int y) {
-                int32_t v[4], w[4];
-                init0i(v, s, x / 4, y);
-                util_format_read_4i(surface_fmts[i], w, 0,
-                                    v, 0, 0, 0, 1, 1);
-                *(uint32_t *)p = w[x % 4];
-        }
 
         printf("- %s\n", __func__);
 
@@ -1085,14 +1126,14 @@ static void test_surface_ld(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, (is_int ? init0i : init0f));
+                         128, 32, (is_int ? test_surface_ld_init0i : test_surface_ld_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, init1);
+                         512, 32, test_default_init);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int ? expecti : expectf), NULL);
+                check_tex(ctx, 1, (is_int ? test_surface_ld_expecti : test_surface_ld_expectf), NULL);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1101,6 +1142,73 @@ static void test_surface_ld(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_surface_st */
+static void test_surface_st_init0f(void *p, int s, int x, int y)
+{
+        float v[] = { 1.0, -.75, 0.5, -.25 };
+        *(float *)p = v[x % 4];
+}
+
+static void test_surface_st_init0i(void *p, int s, int x, int y)
+{
+        int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
+        *(int32_t *)p = v[x % 4];
+}
+
+static void test_surface_st_init1(void *p, int s, int x, int y)
+{
+        int i = 0;
+        memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
+}
+
+static void test_surface_st_expectf(void *p, int s, int x, int y)
+{
+        float vf[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0f(&vf[j], s, 4 * x + j, y);
+        util_format_write_4f(surface_fmts[i], vf, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_st_expects(void *p, int s, int x, int y)
+{
+        int32_t v[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4i(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static void test_surface_st_expectu(void *p, int s, int x, int y)
+{
+        uint32_t v[4];
+        int i = 0, j;
+
+        for (j = 0; j < 4; j++)
+                test_surface_st_init0i(&v[j], s, 4 * x + j, y);
+        util_format_write_4ui(surface_fmts[i], v, 0, p, 0, 0, 0, 1, 1);
+}
+
+static bool test_surface_st_check(void *x, void *y, int sz)
+{
+        int i = 0, j;
+
+        if (util_format_is_float(surface_fmts[i])) {
+                return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
+
+        } else if ((sz % 4) == 0) {
+                for (j = 0; j < sz / 4; j++)
+                        if (abs(((uint32_t *)x)[j] -
+                                ((uint32_t *)y)[j]) > 1)
+                                return false;
+                return true;
+        } else {
+                return !memcmp(x, y, sz);
+        }
+}
+
 static void test_surface_st(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1118,60 +1226,6 @@ static void test_surface_st(struct context *ctx)
                 "       RET\n"
                 "    ENDSUB\n";
         int i = 0;
-        void init0f(void *p, int s, int x, int y) {
-                float v[] = { 1.0, -.75, 0.5, -.25 };
-                *(float *)p = v[x % 4];
-        }
-        void init0i(void *p, int s, int x, int y) {
-                int v[] = { 0xffffffff, 0xffff, 0xff, 0xf };
-                *(int32_t *)p = v[x % 4];
-        }
-        void init1(void *p, int s, int x, int y) {
-                memset(p, 1, util_format_get_blocksize(surface_fmts[i]));
-        }
-        void expectf(void *p, int s, int x, int y) {
-                float vf[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0f(&vf[j], s, 4 * x + j, y);
-                util_format_write_4f(surface_fmts[i], vf, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expects(void *p, int s, int x, int y) {
-                int32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4i(surface_fmts[i], v, 0,
-                                     p, 0, 0, 0, 1, 1);
-        }
-        void expectu(void *p, int s, int x, int y) {
-                uint32_t v[4];
-                int j;
-
-                for (j = 0; j < 4; j++)
-                        init0i(&v[j], s, 4 * x + j, y);
-                util_format_write_4ui(surface_fmts[i], v, 0,
-                                      p, 0, 0, 0, 1, 1);
-        }
-        bool check(void *x, void *y, int sz) {
-                int j;
-
-                if (util_format_is_float(surface_fmts[i])) {
-                        return fabs(*(float *)x - *(float *)y) < 3.92156863e-3;
-
-                } else if ((sz % 4) == 0) {
-                        for (j = 0; j < sz / 4; j++)
-                                if (abs(((uint32_t *)x)[j] -
-                                        ((uint32_t *)y)[j]) > 1)
-                                        return false;
-                        return true;
-                } else {
-                        return !memcmp(x, y, sz);
-                }
-        }
 
         printf("- %s\n", __func__);
 
@@ -1192,16 +1246,16 @@ static void test_surface_st(struct context *ctx)
                 }
 
                 init_tex(ctx, 0, PIPE_TEXTURE_2D, true, PIPE_FORMAT_R32_FLOAT,
-                         512, 32, (is_int ? init0i : init0f));
+                         512, 32, (is_int ? test_surface_st_init0i : test_surface_st_init0f));
                 init_tex(ctx, 1, PIPE_TEXTURE_2D, true, surface_fmts[i],
-                         128, 32, init1);
+                         128, 32, test_surface_st_init1);
                 init_compute_resources(ctx, (int []) { 0, 1, -1 });
                 init_sampler_states(ctx, 2);
                 launch_grid(ctx, (uint []){1, 1, 1}, (uint []){128, 32, 1}, 0,
                             NULL);
-                check_tex(ctx, 1, (is_int && is_signed ? expects :
-                                   is_int && !is_signed ? expectu :
-                                   expectf), check);
+                check_tex(ctx, 1, (is_int && is_signed ? test_surface_st_expects :
+                                   is_int && !is_signed ? test_surface_st_expectu :
+                                   test_surface_st_expectf), test_surface_st_check);
                 destroy_sampler_states(ctx);
                 destroy_compute_resources(ctx);
                 destroy_tex(ctx);
@@ -1210,6 +1264,12 @@ static void test_surface_st(struct context *ctx)
         destroy_prog(ctx);
 }
 
+/* test_barrier */
+static void test_barrier_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 31;
+}
+
 static void test_barrier(struct context *ctx)
 {
         const char *src = "COMP\n"
@@ -1259,26 +1319,62 @@ static void test_barrier(struct context *ctx)
                 "       STORE RES[0].x, TEMP[1], TEMP[0]\n"
                 "       RET\n"
                 "    ENDSUB\n";
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 31;
-        }
 
         printf("- %s\n", __func__);
 
         init_prog(ctx, 256, 0, 0, src, NULL);
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_barrier_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_ops */
+static void test_atom_ops_init(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = 0xbad;
+}
+
+static void test_atom_ops_expect(void *p, int s, int x, int y)
+{
+        switch (x) {
+        case 0:
+                *(uint32_t *)p = 0xce6c8eef;
+                break;
+        case 1:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 2:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 3:
+                *(uint32_t *)p = 0x10011001;
+                break;
+        case 4:
+                *(uint32_t *)p = 0xdfbdbfff;
+                break;
+        case 5:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 6:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        case 7:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 8:
+                *(uint32_t *)p = 0xdeadbeef;
+                break;
+        case 9:
+                *(uint32_t *)p = 0x11111111;
+                break;
+        }
+}
+
 static void test_atom_ops(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1381,58 +1477,26 @@ static void test_atom_ops(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xbad;
-        }
-        void expect(void *p, int s, int x, int y) {
-                switch (x) {
-                case 0:
-                        *(uint32_t *)p = 0xce6c8eef;
-                        break;
-                case 1:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 2:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 3:
-                        *(uint32_t *)p = 0x10011001;
-                        break;
-                case 4:
-                        *(uint32_t *)p = 0xdfbdbfff;
-                        break;
-                case 5:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 6:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                case 7:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 8:
-                        *(uint32_t *)p = 0xdeadbeef;
-                        break;
-                case 9:
-                        *(uint32_t *)p = 0x11111111;
-                        break;
-                }
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 40, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 40, 0, init);
+                 40, 0, test_atom_ops_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){10, 1, 1}, (uint []){1, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_ops_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
 }
 
+/* test_atom_race */
+static void test_atom_race_expect(void *p, int s, int x, int y)
+{
+        *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
+}
+
 static void test_atom_race(struct context *ctx, bool global)
 {
         const char *src = "COMP\n"
@@ -1551,22 +1615,15 @@ static void test_atom_race(struct context *ctx, bool global)
                 "       RET\n"
                 "    ENDSUB\n";
 
-        void init(void *p, int s, int x, int y) {
-                *(uint32_t *)p = 0xdeadbeef;
-        }
-        void expect(void *p, int s, int x, int y) {
-                *(uint32_t *)p = x & 0x20 ? 0x11111111 : 0xffffffff;
-        }
-
         printf("- %s (%s)\n", __func__, global ? "global" : "local");
 
         init_prog(ctx, 256, 0, 0, src,
                   (global ? "-DTARGET_GLOBAL" : "-DTARGET_LOCAL"));
         init_tex(ctx, 0, PIPE_BUFFER, true, PIPE_FORMAT_R32_FLOAT,
-                 4096, 0, init);
+                 4096, 0, test_default_init);
         init_compute_resources(ctx, (int []) { 0, -1 });
         launch_grid(ctx, (uint []){64, 1, 1}, (uint []){16, 1, 1}, 0, NULL);
-        check_tex(ctx, 0, expect, NULL);
+        check_tex(ctx, 0, test_atom_race_expect, NULL);
         destroy_compute_resources(ctx);
         destroy_tex(ctx);
         destroy_prog(ctx);
index a844773..82c803b 100644 (file)
@@ -686,6 +686,11 @@ error:
     return NULL;
 }
 
+static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
+}
+
 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->va;
@@ -701,6 +706,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
    ws->base.buffer_create = amdgpu_bo_create;
    ws->base.buffer_from_handle = amdgpu_bo_from_handle;
    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
+   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
    ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
index c6603e3..c44424f 100644 (file)
@@ -13,6 +13,9 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_screen.h"
 
+#include <nvif/class.h>
+#include <nvif/cl0080.h>
+
 static struct util_hash_table *fd_tab = NULL;
 
 pipe_static_mutex(nouveau_screen_mutex);
@@ -27,7 +30,7 @@ bool nouveau_drm_screen_unref(struct nouveau_screen *screen)
        ret = --screen->refcount;
        assert(ret >= 0);
        if (ret == 0)
-               util_hash_table_remove(fd_tab, intptr_to_pointer(screen->device->fd));
+               util_hash_table_remove(fd_tab, intptr_to_pointer(screen->drm->fd));
        pipe_mutex_unlock(nouveau_screen_mutex);
        return ret == 0;
 }
@@ -57,16 +60,19 @@ static int compare_fd(void *key1, void *key2)
 PUBLIC struct pipe_screen *
 nouveau_drm_screen_create(int fd)
 {
+       struct nouveau_drm *drm = NULL;
        struct nouveau_device *dev = NULL;
-       struct pipe_screen *(*init)(struct nouveau_device *);
-       struct nouveau_screen *screen;
-       int ret, dupfd = -1;
+       struct nouveau_screen *(*init)(struct nouveau_device *);
+       struct nouveau_screen *screen = NULL;
+       int ret, dupfd;
 
        pipe_mutex_lock(nouveau_screen_mutex);
        if (!fd_tab) {
                fd_tab = util_hash_table_create(hash_fd, compare_fd);
-               if (!fd_tab)
-                       goto err;
+               if (!fd_tab) {
+                       pipe_mutex_unlock(nouveau_screen_mutex);
+                       return NULL;
+               }
        }
 
        screen = util_hash_table_get(fd_tab, intptr_to_pointer(fd));
@@ -86,7 +92,15 @@ nouveau_drm_screen_create(int fd)
         * creation error.
         */
        dupfd = dup(fd);
-       ret = nouveau_device_wrap(dupfd, 1, &dev);
+
+       ret = nouveau_drm_new(dupfd, &drm);
+       if (ret)
+               goto err;
+
+       ret = nouveau_device_new(&drm->client, NV_DEVICE,
+                                &(struct nv_device_v0) {
+                                       .device = ~0ULL,
+                                }, sizeof(struct nv_device_v0), &dev);
        if (ret)
                goto err;
 
@@ -116,8 +130,8 @@ nouveau_drm_screen_create(int fd)
                goto err;
        }
 
-       screen = (struct nouveau_screen*)init(dev);
-       if (!screen)
+       screen = init(dev);
+       if (!screen || !screen->base.context_create)
                goto err;
 
        /* Use dupfd in hash table, to avoid errors if the original fd gets
@@ -130,10 +144,13 @@ nouveau_drm_screen_create(int fd)
        return &screen->base;
 
 err:
-       if (dev)
+       if (screen) {
+               screen->base.destroy(&screen->base);
+       } else {
                nouveau_device_del(&dev);
-       else if (dupfd >= 0)
+               nouveau_drm_del(&drm);
                close(dupfd);
+       }
        pipe_mutex_unlock(nouveau_screen_mutex);
        return NULL;
 }
index ee61e54..3ec6a06 100644 (file)
@@ -582,7 +582,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
 
             pipe_mutex_unlock(rws->bo_handles_mutex);
             pb_reference(&b, &old_bo->base);
-            return b;
+            return radeon_bo(b);
         }
 
         util_hash_table_set(rws->bo_vas, (void*)(uintptr_t)bo->va, bo);
@@ -594,7 +594,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
     else if (initial_domains & RADEON_DOMAIN_GTT)
         rws->allocated_gtt += align(size, rws->size_align);
 
-    return &bo->base;
+    return bo;
 }
 
 bool radeon_bo_can_reclaim(struct pb_buffer *_buf)
@@ -768,9 +768,9 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
     usage |= 1 << (flags + 3);
 
     if (use_reusable_pool) {
-        bo = pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage);
+        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage));
         if (bo)
-            return bo;
+            return &bo->base;
     }
 
     bo = radeon_create_bo(ws, size, alignment, usage, domain, flags);
@@ -837,7 +837,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
     if (ws->info.r600_virtual_address) {
         struct drm_radeon_gem_va va;
 
-        bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20);
+        bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
 
         va.handle = bo->handle;
         va.operation = RADEON_VA_MAP;
@@ -969,7 +969,7 @@ done:
     if (ws->info.r600_virtual_address && !bo->va) {
         struct drm_radeon_gem_va va;
 
-        bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20);
+        bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
 
         va.handle = bo->handle;
         va.operation = RADEON_VA_MAP;
@@ -1052,6 +1052,11 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
     return TRUE;
 }
 
+static bool radeon_winsys_bo_is_user_ptr(struct pb_buffer *buf)
+{
+   return ((struct radeon_bo*)buf)->user_ptr != NULL;
+}
+
 static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf)
 {
     return ((struct radeon_bo*)buf)->va;
@@ -1067,6 +1072,7 @@ void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws)
     ws->base.buffer_create = radeon_winsys_bo_create;
     ws->base.buffer_from_handle = radeon_winsys_bo_from_handle;
     ws->base.buffer_from_ptr = radeon_winsys_bo_from_ptr;
+    ws->base.buffer_is_user_ptr = radeon_winsys_bo_is_user_ptr;
     ws->base.buffer_get_handle = radeon_winsys_bo_get_handle;
     ws->base.buffer_get_virtual_address = radeon_winsys_bo_va;
     ws->base.buffer_get_initial_domain = radeon_bo_get_initial_domain;
index 4dc3236..dae121e 100644 (file)
@@ -251,7 +251,7 @@ vmw_swc_flush(struct svga_winsys_context *swc,
    vswc->must_flush = FALSE;
    debug_flush_flush(vswc->fctx);
 #endif
-   swc->hints &= ~SVGA_HINT_FLAG_DRAW_EMITTED;
+   swc->hints &= ~SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    vswc->preemptive_flush = FALSE;
    vswc->seen_surfaces = 0;
    vswc->seen_regions = 0;
@@ -373,7 +373,7 @@ vmw_swc_region_relocation(struct svga_winsys_context *swc,
 
    if (vmw_swc_add_validate_buffer(vswc, reloc->buffer, flags)) {
       vswc->seen_regions += reloc->buffer->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_regions >= VMW_GMR_POOL_SIZE/5)
          vswc->preemptive_flush = TRUE;
    }
@@ -416,7 +416,7 @@ vmw_swc_mob_relocation(struct svga_winsys_context *swc,
    if (vmw_swc_add_validate_buffer(vswc, pb_buffer, flags)) {
       vswc->seen_mobs += pb_buffer->size;
 
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_mobs >=
             vswc->vws->ioctl.max_mob_memory / VMW_MAX_MOB_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
@@ -479,7 +479,7 @@ vmw_swc_surface_only_relocation(struct svga_winsys_context *swc,
       ++vswc->surface.staged;
 
       vswc->seen_surfaces += vsurf->size;
-      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+      if ((swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) &&
           vswc->seen_surfaces >=
             vswc->vws->ioctl.max_surface_memory / VMW_MAX_SURF_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
index dda423f..e80f8af 100644 (file)
@@ -4,6 +4,7 @@ glsl_parser.cpp
 glsl_parser.h
 glsl_parser.output
 glsl_test
+spirv2nir
 subtest-cr/
 subtest-lf/
 subtest-cr-lf/
index 33a34e4..bd6bbf2 100644 (file)
@@ -78,7 +78,7 @@ check_PROGRAMS =                                      \
        tests/sampler-types-test                        \
        tests/uniform-initializer-test
 
-noinst_PROGRAMS = glsl_compiler
+noinst_PROGRAMS = glsl_compiler spirv2nir
 
 tests_blob_test_SOURCES =                              \
        tests/blob_test.c
@@ -144,11 +144,13 @@ libglsl_la_SOURCES =                                      \
        glsl_parser.h                                   \
        $(LIBGLSL_FILES)                                \
        $(NIR_FILES)                                    \
-       $(NIR_GENERATED_FILES)
-
+       $(SPIRV_FILES)                                  \
+       $(NIR_GENERATED_FILES)                          \
+       $(GLSL_TO_NIR_FILES)
 
 libnir_la_SOURCES =                                    \
        $(NIR_FILES)                                    \
+       $(SPIRV_FILES)                                  \
        $(NIR_GENERATED_FILES)
 
 glsl_compiler_SOURCES = \
@@ -160,6 +162,16 @@ glsl_compiler_LDADD =                                      \
        $(top_builddir)/src/util/libmesautil.la         \
        $(PTHREAD_LIBS)
 
+spirv2nir_SOURCES = \
+       standalone_scaffolding.cpp \
+       standalone_scaffolding.h \
+       nir/spirv2nir.c
+
+spirv2nir_LDADD =                                      \
+       libglsl.la                                      \
+       $(top_builddir)/src/libglsl_util.la             \
+       $(PTHREAD_LIBS)
+
 glsl_test_SOURCES = \
        standalone_scaffolding.cpp \
        test.cpp \
@@ -247,23 +259,23 @@ PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
 
 nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
        $(MKDIR_GEN)
-       $(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@
+       $(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false)
 
 nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py
        $(MKDIR_GEN)
-       $(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@
+       $(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
 
 nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
        $(MKDIR_GEN)
-       $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@
+       $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false)
 
 nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
        $(MKDIR_GEN)
-       $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@
+       $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false)
 
 nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py
        $(MKDIR_GEN)
-       $(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@
+       $(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false)
 
 nir_tests_control_flow_tests_SOURCES =                 \
        nir/tests/control_flow_tests.cpp
index fc10f14..777abf1 100644 (file)
@@ -18,8 +18,6 @@ NIR_GENERATED_FILES = \
        nir/nir_opt_algebraic.c
 
 NIR_FILES = \
-       nir/glsl_to_nir.cpp \
-       nir/glsl_to_nir.h \
        nir/glsl_types.cpp \
        nir/glsl_types.h \
        nir/builtin_type_macros.h \
@@ -34,7 +32,9 @@ NIR_FILES = \
        nir/nir_control_flow_private.h \
        nir/nir_dominance.c \
        nir/nir_from_ssa.c \
+       nir/nir_gather_info.c \
        nir/nir_gs_count_vertices.c \
+       nir/nir_inline_functions.c \
        nir/nir_intrinsics.c \
        nir/nir_intrinsics.h \
        nir/nir_instr_set.c \
@@ -43,8 +43,10 @@ NIR_FILES = \
        nir/nir_lower_alu_to_scalar.c \
        nir/nir_lower_atomics.c \
        nir/nir_lower_clip.c \
+       nir/nir_lower_returns.c \
        nir/nir_lower_global_vars_to_local.c \
        nir/nir_lower_gs_intrinsics.c \
+        nir/nir_lower_indirect_derefs.c \
        nir/nir_lower_load_const_to_scalar.c \
        nir/nir_lower_locals_to_regs.c \
        nir/nir_lower_idiv.c \
@@ -72,8 +74,11 @@ NIR_FILES = \
        nir/nir_opt_peephole_select.c \
        nir/nir_opt_remove_phis.c \
        nir/nir_opt_undef.c \
+       nir/nir_phi_builder.c \
+       nir/nir_phi_builder.h \
        nir/nir_print.c \
        nir/nir_remove_dead_variables.c \
+       nir/nir_repair_ssa.c \
        nir/nir_search.c \
        nir/nir_search.h \
        nir/nir_split_var_copies.c \
@@ -88,6 +93,14 @@ NIR_FILES = \
        nir/shader_enums.h \
        nir/shader_enums.c
 
+SPIRV_FILES = \
+       nir/spirv/nir_spirv.h \
+       nir/spirv/spirv_to_nir.c \
+       nir/spirv/vtn_alu.c \
+       nir/spirv/vtn_cfg.c \
+       nir/spirv/vtn_glsl450.c \
+       nir/spirv/vtn_private.h
+
 # libglsl
 
 LIBGLSL_FILES = \
@@ -213,6 +226,11 @@ LIBGLSL_FILES = \
        s_expression.cpp \
        s_expression.h
 
+# glsl to nir pass
+GLSL_TO_NIR_FILES = \
+       nir/glsl_to_nir.cpp \
+       nir/glsl_to_nir.h
+
 # glsl_compiler
 
 GLSL_COMPILER_CXX_FILES = \
index 70bf5b0..a9d38c1 100644 (file)
@@ -65,6 +65,7 @@ for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'):
 # XXX: Remove this once we build NIR and NIR_FILES.
 glsl_sources += [
     'nir/glsl_types.cpp',
+    'nir/shader_enums.c',
 ]
 
 if env['msvc']:
index adfc793..f8ab0b7 100644 (file)
@@ -726,6 +726,7 @@ public:
                          struct _mesa_glsl_parse_state *state);
 
    const char *name;
+   ast_type_qualifier *layout;
    /* List of ast_declarator_list * */
    exec_list declarations;
    bool is_declaration;
index 35a1e13..02e6e2d 100644 (file)
@@ -106,6 +106,15 @@ public:
       return found;
    }
 
+   virtual ir_visitor_status visit_enter(ir_expression *ir)
+   {
+      /* .length() doesn't actually read anything */
+      if (ir->operation == ir_unop_ssbo_unsized_array_length)
+         return visit_continue_with_parent;
+
+      return visit_continue;
+   }
+
 private:
    ir_variable *found;
 };
@@ -1088,6 +1097,7 @@ do_comparison(void *mem_ctx, int operation, ir_rvalue *op0, ir_rvalue *op1)
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_SUBROUTINE:
       /* I assume a comparison of a struct containing a sampler just
@@ -3105,7 +3115,7 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                _mesa_glsl_error(loc, state,
                                 "misaligned atomic counter offset");
 
-            var->data.atomic.offset = *offset;
+            var->data.offset = *offset;
             *offset += var->type->atomic_size();
 
          } else {
@@ -3517,7 +3527,7 @@ get_variable_being_redeclared(ir_variable *var, YYLTYPE loc,
               state->is_version(150, 0))
               && strcmp(var->name, "gl_FragCoord") == 0
               && earlier->type == var->type
-              && earlier->data.mode == var->data.mode) {
+              && var->data.mode == ir_var_shader_in) {
       /* Allow redeclaration of gl_FragCoord for ARB_fcc layout
        * qualifiers.
        */
@@ -6169,7 +6179,7 @@ ast_type_specifier::hir(exec_list *instructions,
  * The number of fields processed.  A pointer to the array structure fields is
  * stored in \c *fields_ret.
  */
-unsigned
+static unsigned
 ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           struct _mesa_glsl_parse_state *state,
                                           exec_list *declarations,
@@ -6179,7 +6189,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           bool allow_reserved_names,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
-                                          unsigned block_stream)
+                                          unsigned block_stream,
+                                          unsigned expl_location)
 {
    unsigned decl_count = 0;
 
@@ -6200,6 +6211,9 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
    glsl_struct_field *const fields = ralloc_array(state, glsl_struct_field,
                                                   decl_count);
 
+   bool first_member = true;
+   bool first_member_has_explicit_location;
+
    unsigned i = 0;
    foreach_list_typed (ast_declarator_list, decl_list, link, declarations) {
       const char *type_name;
@@ -6264,6 +6278,27 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                           "to struct or interface block members");
       }
 
+      if (is_interface) {
+         if (!first_member) {
+            if (!layout->flags.q.explicit_location &&
+                ((first_member_has_explicit_location &&
+                  !qual->flags.q.explicit_location) ||
+                 (!first_member_has_explicit_location &&
+                  qual->flags.q.explicit_location))) {
+               _mesa_glsl_error(&loc, state,
+                                "when block-level location layout qualifier "
+                                "is not supplied either all members must "
+                                "have a location layout qualifier or all "
+                                "members must not have a location layout "
+                                "qualifier");
+            }
+         } else {
+            first_member = false;
+            first_member_has_explicit_location =
+               qual->flags.q.explicit_location;
+         }
+      }
+
       if (qual->flags.q.std140 ||
           qual->flags.q.std430 ||
           qual->flags.q.packed ||
@@ -6338,7 +6373,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          validate_array_dimensions(field_type, state, &loc);
          fields[i].type = field_type;
          fields[i].name = decl->identifier;
-         fields[i].location = -1;
          fields[i].interpolation =
             interpret_interpolation_qualifier(qual, var_mode, state, &loc);
          fields[i].centroid = qual->flags.q.centroid ? 1 : 0;
@@ -6346,6 +6380,23 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
          fields[i].precision = qual->precision;
 
+         if (qual->flags.q.explicit_location) {
+            unsigned qual_location;
+            if (process_qualifier_constant(state, &loc, "location",
+                                           qual->location, &qual_location)) {
+               fields[i].location = VARYING_SLOT_VAR0 + qual_location;
+               expl_location = fields[i].location +
+                  fields[i].type->count_attribute_slots(false);
+            }
+         } else {
+            if (layout && layout->flags.q.explicit_location) {
+               fields[i].location = expl_location;
+               expl_location += fields[i].type->count_attribute_slots(false);
+            } else {
+               fields[i].location = -1;
+            }
+         }
+
          /* Propogate row- / column-major information down the fields of the
           * structure or interface block.  Structures need this data because
           * the structure may contain a structure that contains ... a matrix
@@ -6444,6 +6495,16 @@ ast_struct_specifier::hir(exec_list *instructions,
 
    state->struct_specifier_depth++;
 
+   unsigned expl_location = 0;
+   if (layout && layout->flags.q.explicit_location) {
+      if (!process_qualifier_constant(state, &loc, "location",
+                                      layout->location, &expl_location)) {
+         return NULL;
+      } else {
+         expl_location = VARYING_SLOT_VAR0 + expl_location;
+      }
+   }
+
    glsl_struct_field *fields;
    unsigned decl_count =
       ast_process_struct_or_iface_block_members(instructions,
@@ -6454,8 +6515,9 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 GLSL_MATRIX_LAYOUT_INHERITED,
                                                 false /* allow_reserved_names */,
                                                 ir_var_auto,
-                                                NULL,
-                                                0 /* for interface only */);
+                                                layout,
+                                                0, /* for interface only */
+                                                expl_location);
 
    validate_identifier(this->name, loc, state);
 
@@ -6620,6 +6682,16 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
+   unsigned expl_location = 0;
+   if (layout.flags.q.explicit_location) {
+      if (!process_qualifier_constant(state, &loc, "location",
+                                      layout.location, &expl_location)) {
+         return NULL;
+      } else {
+         expl_location = VARYING_SLOT_VAR0 + expl_location;
+      }
+   }
+
    unsigned int num_variables =
       ast_process_struct_or_iface_block_members(&declared_variables,
                                                 state,
@@ -6630,7 +6702,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                 redeclaring_per_vertex,
                                                 var_mode,
                                                 &this->layout,
-                                                qual_stream);
+                                                qual_stream,
+                                                expl_location);
 
    state->struct_specifier_depth--;
 
@@ -6969,6 +7042,10 @@ ast_interface_block::hir(exec_list *instructions,
          }
 
          var->data.stream = qual_stream;
+         if (layout.flags.q.explicit_location) {
+            var->data.location = expl_location;
+            var->data.explicit_location = true;
+         }
 
          state->symbols->add_variable(var);
          instructions->push_tail(var);
@@ -6989,6 +7066,9 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.sample = fields[i].sample;
          var->data.patch = fields[i].patch;
          var->data.stream = qual_stream;
+         var->data.location = fields[i].location;
+         if (fields[i].location != -1)
+            var->data.explicit_location = true;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
index 9973a76..f2e2165 100644 (file)
@@ -136,6 +136,12 @@ v140(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+v140_or_es3(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(140, 300);
+}
+
+static bool
 v400_fs_only(const _mesa_glsl_parse_state *state)
 {
    return state->is_version(400, 0) &&
@@ -544,6 +550,7 @@ private:
    ir_variable *in_var(const glsl_type *type, const char *name);
    ir_variable *out_var(const glsl_type *type, const char *name);
    ir_constant *imm(float f, unsigned vector_elements=1);
+   ir_constant *imm(bool b, unsigned vector_elements=1);
    ir_constant *imm(int i, unsigned vector_elements=1);
    ir_constant *imm(unsigned u, unsigned vector_elements=1);
    ir_constant *imm(double d, unsigned vector_elements=1);
@@ -1438,9 +1445,9 @@ builtin_builder::create_builtins()
 
                 NULL);
    add_function("inverse",
-                _inverse_mat2(v120, glsl_type::mat2_type),
-                _inverse_mat3(v120, glsl_type::mat3_type),
-                _inverse_mat4(v120, glsl_type::mat4_type),
+                _inverse_mat2(v140_or_es3, glsl_type::mat2_type),
+                _inverse_mat3(v140_or_es3, glsl_type::mat3_type),
+                _inverse_mat4(v140_or_es3, glsl_type::mat4_type),
                 _inverse_mat2(fp64, glsl_type::dmat2_type),
                 _inverse_mat3(fp64, glsl_type::dmat3_type),
                 _inverse_mat4(fp64, glsl_type::dmat4_type),
@@ -3006,6 +3013,12 @@ builtin_builder::out_var(const glsl_type *type, const char *name)
 }
 
 ir_constant *
+builtin_builder::imm(bool b, unsigned vector_elements)
+{
+   return new(mem_ctx) ir_constant(b, vector_elements);
+}
+
+ir_constant *
 builtin_builder::imm(float f, unsigned vector_elements)
 {
    return new(mem_ctx) ir_constant(f, vector_elements);
@@ -4364,7 +4377,13 @@ builtin_builder::_notEqual(builtin_available_predicate avail,
 ir_function_signature *
 builtin_builder::_any(const glsl_type *type)
 {
-   return unop(always_available, ir_unop_any, glsl_type::bool_type, type);
+   ir_variable *v = in_var(type, "v");
+   MAKE_SIG(glsl_type::bool_type, always_available, 1, v);
+
+   const unsigned vec_elem = v->type->vector_elements;
+   body.emit(ret(expr(ir_binop_any_nequal, v, imm(false, vec_elem))));
+
+   return sig;
 }
 
 ir_function_signature *
@@ -4373,20 +4392,8 @@ builtin_builder::_all(const glsl_type *type)
    ir_variable *v = in_var(type, "v");
    MAKE_SIG(glsl_type::bool_type, always_available, 1, v);
 
-   switch (type->vector_elements) {
-   case 2:
-      body.emit(ret(logic_and(swizzle_x(v), swizzle_y(v))));
-      break;
-   case 3:
-      body.emit(ret(logic_and(logic_and(swizzle_x(v), swizzle_y(v)),
-                              swizzle_z(v))));
-      break;
-   case 4:
-      body.emit(ret(logic_and(logic_and(logic_and(swizzle_x(v), swizzle_y(v)),
-                                        swizzle_z(v)),
-                              swizzle_w(v))));
-      break;
-   }
+   const unsigned vec_elem = v->type->vector_elements;
+   body.emit(ret(expr(ir_binop_all_equal, v, imm(true, vec_elem))));
 
    return sig;
 }
@@ -4882,12 +4889,18 @@ builtin_builder::_noise4(const glsl_type *type)
 ir_function_signature *
 builtin_builder::_bitfieldExtract(const glsl_type *type)
 {
+   bool is_uint = type->base_type == GLSL_TYPE_UINT;
    ir_variable *value  = in_var(type, "value");
    ir_variable *offset = in_var(glsl_type::int_type, "offset");
    ir_variable *bits   = in_var(glsl_type::int_type, "bits");
    MAKE_SIG(type, gpu_shader5_or_es31, 3, value, offset, bits);
 
-   body.emit(ret(expr(ir_triop_bitfield_extract, value, offset, bits)));
+   operand cast_offset = is_uint ? i2u(offset) : operand(offset);
+   operand cast_bits = is_uint ? i2u(bits) : operand(bits);
+
+   body.emit(ret(expr(ir_triop_bitfield_extract, value,
+      swizzle(cast_offset, SWIZZLE_XXXX, type->vector_elements),
+      swizzle(cast_bits, SWIZZLE_XXXX, type->vector_elements))));
 
    return sig;
 }
@@ -4895,13 +4908,19 @@ builtin_builder::_bitfieldExtract(const glsl_type *type)
 ir_function_signature *
 builtin_builder::_bitfieldInsert(const glsl_type *type)
 {
+   bool is_uint = type->base_type == GLSL_TYPE_UINT;
    ir_variable *base   = in_var(type, "base");
    ir_variable *insert = in_var(type, "insert");
    ir_variable *offset = in_var(glsl_type::int_type, "offset");
    ir_variable *bits   = in_var(glsl_type::int_type, "bits");
    MAKE_SIG(type, gpu_shader5_or_es31, 4, base, insert, offset, bits);
 
-   body.emit(ret(bitfield_insert(base, insert, offset, bits)));
+   operand cast_offset = is_uint ? i2u(offset) : operand(offset);
+   operand cast_bits = is_uint ? i2u(bits) : operand(bits);
+
+   body.emit(ret(bitfield_insert(base, insert,
+      swizzle(cast_offset, SWIZZLE_XXXX, type->vector_elements),
+      swizzle(cast_bits, SWIZZLE_XXXX, type->vector_elements))));
 
    return sig;
 }
index e8eab80..221aab0 100644 (file)
@@ -951,6 +951,11 @@ builtin_variable_generator::generate_vs_special_vars()
       add_system_value(SYSTEM_VALUE_INSTANCE_ID, int_t, "gl_InstanceIDARB");
    if (state->ARB_draw_instanced_enable || state->is_version(140, 300))
       add_system_value(SYSTEM_VALUE_INSTANCE_ID, int_t, "gl_InstanceID");
+   if (state->ARB_shader_draw_parameters_enable) {
+      add_system_value(SYSTEM_VALUE_BASE_VERTEX, int_t, "gl_BaseVertexARB");
+      add_system_value(SYSTEM_VALUE_BASE_INSTANCE, int_t, "gl_BaseInstanceARB");
+      add_system_value(SYSTEM_VALUE_DRAW_ID, int_t, "gl_DrawIDARB");
+   }
    if (state->AMD_vertex_shader_layer_enable) {
       var = add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
       var->data.interpolation = INTERP_QUALIFIER_FLAT;
@@ -1052,8 +1057,16 @@ builtin_variable_generator::generate_fs_special_vars()
 {
    ir_variable *var;
 
-   add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
-   add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+   if (this->state->ctx->Const.GLSLFragCoordIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRAG_COORD, vec4_t, "gl_FragCoord");
+   else
+      add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
+
+   if (this->state->ctx->Const.GLSLFrontFacingIsSysVal)
+      add_system_value(SYSTEM_VALUE_FRONT_FACE, bool_t, "gl_FrontFacing");
+   else
+      add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
+
    if (state->is_version(120, 100))
       add_input(VARYING_SLOT_PNTC, vec2_t, "gl_PointCoord");
 
index 2fd4cf0..ef1a657 100644 (file)
@@ -2506,6 +2506,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
               if (extensions->ARB_shader_subroutine)
                  add_builtin_define(parser, "GL_ARB_shader_subroutine", 1);
+
+              if (extensions->ARB_shader_draw_parameters)
+                 add_builtin_define(parser, "GL_ARB_shader_draw_parameters", 1);
           }
        }
 
index 7eb383a..51796a6 100644 (file)
@@ -1130,6 +1130,10 @@ fully_specified_type:
       $$->set_location_range(@1, @2);
       $$->qualifier = $1;
       $$->specifier = $2;
+      if ($$->specifier->structure != NULL &&
+          $$->specifier->structure->is_declaration) {
+            $$->specifier->structure->layout = &$$->qualifier;
+      }
    }
    ;
 
index 29cf0c6..131a564 100644 (file)
@@ -87,6 +87,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
 
    this->extensions = &ctx->Extensions;
 
+   this->ARB_compute_shader_enable = true;
+
    this->Const.MaxLights = ctx->Const.MaxLights;
    this->Const.MaxClipPlanes = ctx->Const.MaxClipPlanes;
    this->Const.MaxTextureUnits = ctx->Const.MaxTextureUnits;
@@ -412,44 +414,6 @@ _mesa_glsl_parse_state::process_version_directive(YYLTYPE *locp, int version,
 }
 
 
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_string(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "vertex";
-   case MESA_SHADER_FRAGMENT: return "fragment";
-   case MESA_SHADER_GEOMETRY: return "geometry";
-   case MESA_SHADER_COMPUTE:  return "compute";
-   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
-   case MESA_SHADER_TESS_EVAL: return "tess eval";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *
-_mesa_shader_stage_to_abbrev(unsigned stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:   return "VS";
-   case MESA_SHADER_FRAGMENT: return "FS";
-   case MESA_SHADER_GEOMETRY: return "GS";
-   case MESA_SHADER_COMPUTE:  return "CS";
-   case MESA_SHADER_TESS_CTRL: return "TCS";
-   case MESA_SHADER_TESS_EVAL: return "TES";
-   }
-
-   unreachable("Unknown shader stage.");
-}
-
 /* This helper function will append the given message to the shader's
    info log and report it via GL_ARB_debug_output. Per that extension,
    'type' is one of the enum values classifying the message, and
@@ -608,6 +572,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_shader_atomic_counters,       true,  false,     ARB_shader_atomic_counters),
    EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
    EXT(ARB_shader_clock,                 true,  false,     ARB_shader_clock),
+   EXT(ARB_shader_draw_parameters,       true,  false,     ARB_shader_draw_parameters),
    EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
    EXT(ARB_shader_image_size,            true,  false,     ARB_shader_image_size),
    EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
index a4bda77..ecc2992 100644 (file)
@@ -536,6 +536,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_bit_encoding_warn;
    bool ARB_shader_clock_enable;
    bool ARB_shader_clock_warn;
+   bool ARB_shader_draw_parameters_enable;
+   bool ARB_shader_draw_parameters_warn;
    bool ARB_shader_image_load_store_enable;
    bool ARB_shader_image_load_store_warn;
    bool ARB_shader_image_size_enable;
@@ -729,16 +731,6 @@ extern bool _mesa_glsl_process_extension(const char *name, YYLTYPE *name_locp,
 extern "C" {
 #endif
 
-/**
- * Get the textual name of the specified shader stage (which is a
- * gl_shader_stage).
- */
-extern const char *
-_mesa_shader_stage_to_string(unsigned stage);
-
-extern const char *
-_mesa_shader_stage_to_abbrev(unsigned stage);
-
 extern int glcpp_preprocess(void *ctx, const char **shader, char **info_log,
                       const struct gl_extensions *extensions, struct gl_context *gl_ctx);
 
index f989e9b..b424edd 100644 (file)
@@ -307,10 +307,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
       this->type = glsl_type::uvec2_type;
       break;
 
-   case ir_unop_any:
-      this->type = glsl_type::bool_type;
-      break;
-
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_snorm_4x8:
    case ir_unop_pack_unorm_2x16:
@@ -435,7 +431,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
    case ir_binop_borrow:
    case ir_binop_lshift:
    case ir_binop_rshift:
-   case ir_binop_bfm:
    case ir_binop_ldexp:
    case ir_binop_interpolate_at_offset:
    case ir_binop_interpolate_at_sample:
@@ -472,7 +467,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1,
       this->type = op0->type;
       break;
 
-   case ir_triop_bfi:
    case ir_triop_csel:
       this->type = op1->type;
       break;
@@ -538,7 +532,6 @@ static const char *const operator_strs[] = {
    "bitcast_f2i",
    "bitcast_u2f",
    "bitcast_f2u",
-   "any",
    "trunc",
    "ceil",
    "floor",
@@ -607,7 +600,6 @@ static const char *const operator_strs[] = {
    "max",
    "pow",
    "packHalf2x16_split",
-   "bfm",
    "ubo_load",
    "ldexp",
    "vector_extract",
@@ -616,7 +608,6 @@ static const char *const operator_strs[] = {
    "fma",
    "lrp",
    "csel",
-   "bfi",
    "bitfield_extract",
    "vector_insert",
    "bitfield_insert",
@@ -1679,7 +1670,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->data.mode = mode;
    this->data.interpolation = INTERP_QUALIFIER_NONE;
    this->data.max_array_access = 0;
-   this->data.atomic.offset = 0;
+   this->data.offset = 0;
    this->data.precision = GLSL_PRECISION_NONE;
    this->data.image_read_only = false;
    this->data.image_write_only = false;
index bdc932e..5b845c6 100644 (file)
@@ -871,9 +871,7 @@ public:
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * Highest element accessed with a constant expression array index
@@ -1355,7 +1353,6 @@ enum ir_expression_operation {
    ir_unop_bitcast_f2i, /**< Bit-identical float-to-int "conversion" */
    ir_unop_bitcast_u2f, /**< Bit-identical uint-to-float "conversion" */
    ir_unop_bitcast_f2u, /**< Bit-identical float-to-uint "conversion" */
-   ir_unop_any,
 
    /**
     * \name Unary floating-point rounding operations.
@@ -1554,15 +1551,6 @@ enum ir_expression_operation {
    /*@}*/
 
    /**
-    * \name First half of a lowered bitfieldInsert() operation.
-    *
-    * \see lower_instructions::bitfield_insert_to_bfm_bfi
-    */
-   /*@{*/
-   ir_binop_bfm,
-   /*@}*/
-
-   /**
     * Load a value the size of a given GLSL type from a uniform block.
     *
     * operand0 is the ir_constant uniform block index in the linked shader.
@@ -1627,15 +1615,6 @@ enum ir_expression_operation {
    ir_triop_csel,
    /*@}*/
 
-   /**
-    * \name Second half of a lowered bitfieldInsert() operation.
-    *
-    * \see lower_instructions::bitfield_insert_to_bfm_bfi
-    */
-   /*@{*/
-   ir_triop_bfi,
-   /*@}*/
-
    ir_triop_bitfield_extract,
 
    /**
@@ -1726,8 +1705,9 @@ public:
    {
       return operation == ir_binop_all_equal ||
              operation == ir_binop_any_nequal ||
-             operation == ir_unop_any ||
              operation == ir_binop_dot ||
+             operation == ir_binop_vector_extract ||
+             operation == ir_triop_vector_insert ||
              operation == ir_quadop_vector;
    }
 
index bee60a2..2aef4fc 100644 (file)
@@ -366,6 +366,7 @@ ir_constant::clone(void *mem_ctx, struct hash_table *ht) const
       return c;
    }
 
+   case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
index ef70585..c99a823 100644 (file)
@@ -648,14 +648,6 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
         data.u[c] = bitcast_f2u(op[0]->value.f[c]);
       }
       break;
-   case ir_unop_any:
-      assert(op[0]->type->is_boolean());
-      data.b[0] = false;
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-        if (op[0]->value.b[c])
-           data.b[0] = true;
-      }
-      break;
    case ir_unop_d2f:
       assert(op[0]->type->base_type == GLSL_TYPE_DOUBLE);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
@@ -1547,10 +1539,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
             data.i[c] = -1;
          else {
             int count = 0;
-            int top_bit = op[0]->type->base_type == GLSL_TYPE_UINT
-                          ? 0 : v & (1 << 31);
+            unsigned top_bit = op[0]->type->base_type == GLSL_TYPE_UINT
+                               ? 0 : v & (1u << 31);
 
-            while (((v & (1 << 31)) == top_bit) && count != 32) {
+            while (((v & (1u << 31)) == top_bit) && count != 32) {
                count++;
                v <<= 1;
             }
@@ -1596,10 +1588,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
       break;
 
    case ir_triop_bitfield_extract: {
-      int offset = op[1]->value.i[0];
-      int bits = op[2]->value.i[0];
-
       for (unsigned c = 0; c < components; c++) {
+         int offset = op[1]->value.i[c];
+         int bits = op[2]->value.i[c];
+
          if (bits == 0)
             data.u[c] = 0;
          else if (offset < 0 || bits < 0)
@@ -1624,23 +1616,6 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
       break;
    }
 
-   case ir_binop_bfm: {
-      int bits = op[0]->value.i[0];
-      int offset = op[1]->value.i[0];
-
-      for (unsigned c = 0; c < components; c++) {
-         if (bits == 0)
-            data.u[c] = op[0]->value.u[c];
-         else if (offset < 0 || bits < 0)
-            data.u[c] = 0; /* Undefined for bitfieldInsert, per spec. */
-         else if (offset + bits > 32)
-            data.u[c] = 0; /* Undefined for bitfieldInsert, per spec. */
-         else
-            data.u[c] = ((1 << bits) - 1) << offset;
-      }
-      break;
-   }
-
    case ir_binop_ldexp:
       for (unsigned c = 0; c < components; c++) {
          if (op[0]->type->base_type == GLSL_TYPE_DOUBLE) {
@@ -1735,10 +1710,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
    }
 
    case ir_quadop_bitfield_insert: {
-      int offset = op[2]->value.i[0];
-      int bits = op[3]->value.i[0];
-
       for (unsigned c = 0; c < components; c++) {
+         int offset = op[2]->value.i[c];
+         int bits = op[3]->value.i[c];
+
          if (bits == 0)
             data.u[c] = op[0]->value.u[c];
          else if (offset < 0 || bits < 0)
@@ -1746,7 +1721,7 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
          else if (offset + bits > 32)
             data.u[c] = 0; /* Undefined, per spec. */
          else {
-            unsigned insert_mask = ((1 << bits) - 1) << offset;
+            unsigned insert_mask = ((1ull << bits) - 1) << offset;
 
             unsigned insert = op[1]->value.u[c];
             insert <<= offset;
@@ -1832,9 +1807,7 @@ ir_swizzle::constant_expression_value(struct hash_table *variable_context)
 ir_constant *
 ir_dereference_variable::constant_expression_value(struct hash_table *variable_context)
 {
-   /* This may occur during compile and var->type is glsl_type::error_type */
-   if (!var)
-      return NULL;
+   assert(var);
 
    /* Give priority to the context hashtable, if it exists */
    if (variable_context) {
index dabd80a..be86f54 100644 (file)
 #define LOG_TO_LOG2        0x10
 #define MOD_TO_FLOOR       0x20
 #define INT_DIV_TO_MUL_RCP 0x40
-#define BITFIELD_INSERT_TO_BFM_BFI 0x80
-#define LDEXP_TO_ARITH     0x100
-#define CARRY_TO_ARITH     0x200
-#define BORROW_TO_ARITH    0x400
-#define SAT_TO_CLAMP       0x800
-#define DOPS_TO_DFRAC      0x1000
-#define DFREXP_DLDEXP_TO_ARITH    0x2000
+#define LDEXP_TO_ARITH     0x80
+#define CARRY_TO_ARITH     0x100
+#define BORROW_TO_ARITH    0x200
+#define SAT_TO_CLAMP       0x400
+#define DOPS_TO_DFRAC      0x800
+#define DFREXP_DLDEXP_TO_ARITH    0x1000
 
 /**
  * \see class lower_packing_builtins_visitor
index d7c29b0..a2dea67 100644 (file)
@@ -81,16 +81,9 @@ is_shader_inout(ir_variable *var)
           var->data.mode == ir_var_system_value;
 }
 
-static inline bool
-is_dual_slot(ir_variable *var)
-{
-   const glsl_type *type = var->type->without_array();
-   return type == glsl_type::dvec4_type || type == glsl_type::dvec3_type;
-}
-
 static void
 mark(struct gl_program *prog, ir_variable *var, int offset, int len,
-     bool is_fragment_shader)
+     gl_shader_stage stage)
 {
    /* As of GLSL 1.20, varyings can only be floats, floating-point
     * vectors or matrices, or arrays of them.  For Mesa programs using
@@ -101,7 +94,6 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
     */
 
    for (int i = 0; i < len; i++) {
-      bool dual_slot = is_dual_slot(var);
       int idx = var->data.location + var->data.index + offset + i;
       bool is_patch_generic = var->data.patch &&
                               idx != VARYING_SLOT_TESS_LEVEL_INNER &&
@@ -123,9 +115,12 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
          else
             prog->InputsRead |= bitfield;
 
-         if (dual_slot)
+         /* double inputs read is only for vertex inputs */
+         if (stage == MESA_SHADER_VERTEX &&
+             var->type->without_array()->is_dual_slot_double())
             prog->DoubleInputsRead |= bitfield;
-         if (is_fragment_shader) {
+
+         if (stage == MESA_SHADER_FRAGMENT) {
             gl_fragment_program *fprog = (gl_fragment_program *) prog;
             fprog->InterpQualifier[idx] =
                (glsl_interp_qualifier) var->data.interpolation;
@@ -154,6 +149,7 @@ void
 ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
 {
    const glsl_type *type = var->type;
+   bool vertex_input = false;
    if (this->shader_stage == MESA_SHADER_GEOMETRY &&
        var->data.mode == ir_var_shader_in && type->is_array()) {
       type = type->fields.array;
@@ -177,8 +173,12 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
       type = type->fields.array;
    }
 
-   mark(this->prog, var, 0, type->count_attribute_slots(),
-        this->shader_stage == MESA_SHADER_FRAGMENT);
+   if (this->shader_stage == MESA_SHADER_VERTEX &&
+       var->data.mode == ir_var_shader_in)
+      vertex_input = true;
+
+   mark(this->prog, var, 0, type->count_attribute_slots(vertex_input),
+        this->shader_stage);
 }
 
 /* Default handler: Mark all the locations in the variable as used. */
@@ -301,8 +301,15 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
       return false;
    }
 
+   /* double element width for double types that takes two slots */
+   if (this->shader_stage != MESA_SHADER_VERTEX ||
+       var->data.mode != ir_var_shader_in) {
+      if (type->without_array()->is_dual_slot_double())
+        elem_width *= 2;
+   }
+
    mark(this->prog, var, index_as_constant->value.u[0] * elem_width,
-        elem_width, this->shader_stage == MESA_SHADER_FRAGMENT);
+        elem_width, this->shader_stage);
    return true;
 }
 
index e63b5c3..9481479 100644 (file)
@@ -320,11 +320,6 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->type->base_type == GLSL_TYPE_UINT);
       break;
 
-   case ir_unop_any:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->type == glsl_type::bool_type);
-      break;
-
    case ir_unop_trunc:
    case ir_unop_round_even:
    case ir_unop_ceil:
@@ -578,12 +573,6 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[1]->type == glsl_type::float_type);
       break;
 
-   case ir_binop_bfm:
-      assert(ir->type->is_integer());
-      assert(ir->operands[0]->type->is_integer());
-      assert(ir->operands[1]->type->is_integer());
-      break;
-
    case ir_binop_ubo_load:
       assert(ir->operands[0]->type == glsl_type::uint_type);
 
@@ -642,16 +631,11 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->type == ir->operands[2]->type);
       break;
 
-   case ir_triop_bfi:
-      assert(ir->operands[0]->type->is_integer());
-      assert(ir->operands[1]->type == ir->operands[2]->type);
-      assert(ir->operands[1]->type == ir->type);
-      break;
-
    case ir_triop_bitfield_extract:
+      assert(ir->type->is_integer());
       assert(ir->operands[0]->type == ir->type);
-      assert(ir->operands[1]->type == glsl_type::int_type);
-      assert(ir->operands[2]->type == glsl_type::int_type);
+      assert(ir->operands[1]->type == ir->type);
+      assert(ir->operands[2]->type == ir->type);
       break;
 
    case ir_triop_vector_insert:
@@ -664,10 +648,11 @@ ir_validate::visit_leave(ir_expression *ir)
       break;
 
    case ir_quadop_bitfield_insert:
+      assert(ir->type->is_integer());
       assert(ir->operands[0]->type == ir->type);
       assert(ir->operands[1]->type == ir->type);
-      assert(ir->operands[2]->type == glsl_type::int_type);
-      assert(ir->operands[3]->type == glsl_type::int_type);
+      assert(ir->operands[2]->type == ir->type);
+      assert(ir->operands[3]->type == ir->type);
       break;
 
    case ir_quadop_vector:
index 3aa52db..277d473 100644 (file)
@@ -83,16 +83,16 @@ namespace {
       const active_atomic_counter *const first = (active_atomic_counter *) a;
       const active_atomic_counter *const second = (active_atomic_counter *) b;
 
-      return int(first->var->data.atomic.offset) - int(second->var->data.atomic.offset);
+      return int(first->var->data.offset) - int(second->var->data.offset);
    }
 
    bool
    check_atomic_counters_overlap(const ir_variable *x, const ir_variable *y)
    {
-      return ((x->data.atomic.offset >= y->data.atomic.offset &&
-               x->data.atomic.offset < y->data.atomic.offset + y->type->atomic_size()) ||
-              (y->data.atomic.offset >= x->data.atomic.offset &&
-               y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size()));
+      return ((x->data.offset >= y->data.offset &&
+               x->data.offset < y->data.offset + y->type->atomic_size()) ||
+              (y->data.offset >= x->data.offset &&
+               y->data.offset < x->data.offset + x->type->atomic_size()));
    }
 
    void
@@ -158,7 +158,7 @@ namespace {
             ir_variable *var = node->as_variable();
 
             if (var && var->type->contains_atomic()) {
-               int offset = var->data.atomic.offset;
+               int offset = var->data.offset;
                unsigned uniform_loc = var->data.location;
                process_atomic_variable(var->type, prog, &uniform_loc,
                                        var, buffers, num_buffers, &offset, i);
@@ -185,7 +185,7 @@ namespace {
                linker_error(prog, "Atomic counter %s declared at offset %d "
                             "which is already in use.",
                             buffers[i].counters[j].var->name,
-                            buffers[i].counters[j].var->data.atomic.offset);
+                            buffers[i].counters[j].var->data.offset);
             }
          }
       }
@@ -237,7 +237,7 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
             var->data.binding = i;
 
          storage->atomic_buffer_index = i;
-         storage->offset = var->data.atomic.offset;
+         storage->offset = var->data.offset;
          storage->array_stride = (var->type->is_array() ?
                                   var->type->without_array()->atomic_size() : 0);
          if (!var->type->is_matrix())
index 936e2e0..64c30fe 100644 (file)
 #include "glsl_symbol_table.h"
 #include "linker.h"
 #include "main/macros.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 
 namespace {
 
 /**
- * Information about a single interface block definition that we need to keep
- * track of in order to check linkage rules.
- *
- * Note: this class is expected to be short lived, so it doesn't make copies
- * of the strings it references; it simply borrows the pointers from the
- * ir_variable class.
- */
-struct interface_block_definition
-{
-   /**
-    * Extract an interface block definition from an ir_variable that
-    * represents either the interface instance (for named interfaces), or a
-    * member of the interface (for unnamed interfaces).
-    */
-   explicit interface_block_definition(ir_variable *var)
-      : var(var),
-        type(var->get_interface_type()),
-        instance_name(NULL)
-   {
-      if (var->is_interface_instance()) {
-         instance_name = var->name;
-      }
-      explicitly_declared = (var->data.how_declared != ir_var_declared_implicitly);
-   }
-   /**
-    * Interface block ir_variable
-    */
-   ir_variable *var;
-
-   /**
-    * Interface block type
-    */
-   const glsl_type *type;
-
-   /**
-    * For a named interface block, the instance name.  Otherwise NULL.
-    */
-   const char *instance_name;
-
-   /**
-    * True if this interface block was explicitly declared in the shader;
-    * false if it was an implicitly declared built-in interface block.
-    */
-   bool explicitly_declared;
-};
-
-
-/**
  * Check if two interfaces match, according to intrastage interface matching
  * rules.  If they do, and the first interface uses an unsized array, it will
  * be updated to reflect the array size declared in the second interface.
  */
 bool
-intrastage_match(interface_block_definition *a,
-                 const interface_block_definition *b,
-                 ir_variable_mode mode,
+intrastage_match(ir_variable *a,
+                 ir_variable *b,
                  struct gl_shader_program *prog)
 {
    /* Types must match. */
-   if (a->type != b->type) {
+   if (a->get_interface_type() != b->get_interface_type()) {
       /* Exception: if both the interface blocks are implicitly declared,
        * don't force their types to match.  They might mismatch due to the two
        * shaders using different GLSL versions, and that's ok.
        */
-      if (a->explicitly_declared || b->explicitly_declared)
+      if (a->data.how_declared != ir_var_declared_implicitly ||
+          b->data.how_declared != ir_var_declared_implicitly)
          return false;
    }
 
    /* Presence/absence of interface names must match. */
-   if ((a->instance_name == NULL) != (b->instance_name == NULL))
+   if (a->is_interface_instance() != b->is_interface_instance())
       return false;
 
    /* For uniforms, instance names need not match.  For shader ins/outs,
     * it's not clear from the spec whether they need to match, but
     * Mesa's implementation relies on them matching.
     */
-   if (a->instance_name != NULL &&
-       mode != ir_var_uniform && mode != ir_var_shader_storage &&
-       strcmp(a->instance_name, b->instance_name) != 0) {
+   if (a->is_interface_instance() && b->data.mode != ir_var_uniform &&
+       b->data.mode != ir_var_shader_storage &&
+       strcmp(a->name, b->name) != 0) {
       return false;
    }
 
    /* If a block is an array then it must match across the shader.
     * Unsized arrays are also processed and matched agaist sized arrays.
     */
-   if (b->var->type != a->var->type &&
-       (b->instance_name != NULL || a->instance_name != NULL) &&
-       !validate_intrastage_arrays(prog, b->var, a->var))
+   if (b->type != a->type &&
+       (b->is_interface_instance() || a->is_interface_instance()) &&
+       !validate_intrastage_arrays(prog, b, a))
       return false;
 
    return true;
@@ -139,43 +91,44 @@ intrastage_match(interface_block_definition *a,
  * This is used for tessellation control and geometry shader consumers.
  */
 bool
-interstage_match(const interface_block_definition *producer,
-                 const interface_block_definition *consumer,
+interstage_match(ir_variable *producer,
+                 ir_variable *consumer,
                  bool extra_array_level)
 {
    /* Unsized arrays should not occur during interstage linking.  They
     * should have all been assigned a size by link_intrastage_shaders.
     */
-   assert(!consumer->var->type->is_unsized_array());
-   assert(!producer->var->type->is_unsized_array());
+   assert(!consumer->type->is_unsized_array());
+   assert(!producer->type->is_unsized_array());
 
    /* Types must match. */
-   if (consumer->type != producer->type) {
+   if (consumer->get_interface_type() != producer->get_interface_type()) {
       /* Exception: if both the interface blocks are implicitly declared,
        * don't force their types to match.  They might mismatch due to the two
        * shaders using different GLSL versions, and that's ok.
        */
-      if (consumer->explicitly_declared || producer->explicitly_declared)
+      if (consumer->data.how_declared != ir_var_declared_implicitly ||
+          producer->data.how_declared != ir_var_declared_implicitly)
          return false;
    }
 
    /* Ignore outermost array if geom shader */
    const glsl_type *consumer_instance_type;
    if (extra_array_level) {
-      consumer_instance_type = consumer->var->type->fields.array;
+      consumer_instance_type = consumer->type->fields.array;
    } else {
-      consumer_instance_type = consumer->var->type;
+      consumer_instance_type = consumer->type;
    }
 
    /* If a block is an array then it must match across shaders.
     * Since unsized arrays have been ruled out, we can check this by just
     * making sure the types are equal.
     */
-   if ((consumer->instance_name != NULL &&
+   if ((consumer->is_interface_instance() &&
         consumer_instance_type->is_array()) ||
-       (producer->instance_name != NULL &&
-        producer->var->type->is_array())) {
-      if (consumer_instance_type != producer->var->type)
+       (producer->is_interface_instance() &&
+        producer->type->is_array())) {
+      if (consumer_instance_type != producer->type)
          return false;
    }
 
@@ -197,35 +150,55 @@ class interface_block_definitions
 public:
    interface_block_definitions()
       : mem_ctx(ralloc_context(NULL)),
-        ht(hash_table_ctor(0, hash_table_string_hash,
-                           hash_table_string_compare))
+        ht(_mesa_hash_table_create(NULL, _mesa_key_hash_string,
+                                   _mesa_key_string_equal))
    {
    }
 
    ~interface_block_definitions()
    {
-      hash_table_dtor(ht);
       ralloc_free(mem_ctx);
+      _mesa_hash_table_destroy(ht, NULL);
    }
 
    /**
-    * Lookup the interface definition having the given block name.  Return
-    * NULL if none is found.
+    * Lookup the interface definition. Return NULL if none is found.
     */
-   interface_block_definition *lookup(const char *block_name)
+   ir_variable *lookup(ir_variable *var)
    {
-      return (interface_block_definition *) hash_table_find(ht, block_name);
+      if (var->data.explicit_location &&
+          var->data.location >= VARYING_SLOT_VAR0) {
+         char location_str[11];
+         snprintf(location_str, 11, "%d", var->data.location);
+
+         const struct hash_entry *entry =
+            _mesa_hash_table_search(ht, location_str);
+         return entry ? (ir_variable *) entry->data : NULL;
+      } else {
+         const struct hash_entry *entry =
+            _mesa_hash_table_search(ht, var->get_interface_type()->name);
+         return entry ? (ir_variable *) entry->data : NULL;
+      }
    }
 
    /**
     * Add a new interface definition.
     */
-   void store(const interface_block_definition &def)
+   void store(ir_variable *var)
    {
-      interface_block_definition *hash_entry =
-         rzalloc(mem_ctx, interface_block_definition);
-      *hash_entry = def;
-      hash_table_insert(ht, hash_entry, def.type->name);
+      if (var->data.explicit_location &&
+          var->data.location >= VARYING_SLOT_VAR0) {
+         /* If explicit location is given then lookup the variable by location.
+          * We turn the location into a string and use this as the hash key
+          * rather than the name. Note: We allocate enough space for a 32-bit
+          * unsigned location value which is overkill but future proof.
+          */
+         char location_str[11];
+         snprintf(location_str, 11, "%d", var->data.location);
+         _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
+      } else {
+         _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+      }
    }
 
 private:
@@ -236,8 +209,7 @@ private:
 
    /**
     * Hash table mapping interface block name to an \c
-    * interface_block_definition struct.  interface_block_definition structs
-    * are allocated using \c mem_ctx.
+    * ir_variable.
     */
    hash_table *ht;
 };
@@ -292,18 +264,13 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
             continue;
          }
 
-         const interface_block_definition def(var);
-         interface_block_definition *prev_def =
-            definitions->lookup(iface_type->name);
-
+         ir_variable *prev_def = definitions->lookup(var);
          if (prev_def == NULL) {
             /* This is the first time we've seen the interface, so save
              * it into the appropriate data structure.
              */
-            definitions->store(def);
-         } else if (!intrastage_match(prev_def, &def,
-                                      (ir_variable_mode) var->data.mode,
-                                      prog)) {
+            definitions->store(var);
+         } else if (!intrastage_match(prev_def, var, prog)) {
             linker_error(prog, "definitions of interface block `%s' do not"
                          " match\n", iface_type->name);
             return;
@@ -329,7 +296,7 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
       if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_in)
          continue;
 
-      definitions.store(interface_block_definition(var));
+      definitions.store(var);
    }
 
    /* Verify that the producer's output interfaces match. */
@@ -338,16 +305,13 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
       if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_out)
          continue;
 
-      interface_block_definition *consumer_def =
-         definitions.lookup(var->get_interface_type()->name);
+      ir_variable *consumer_def = definitions.lookup(var);
 
       /* The consumer doesn't use this output block.  Ignore it. */
       if (consumer_def == NULL)
          continue;
 
-      const interface_block_definition producer_def(var);
-
-      if (!interstage_match(&producer_def, consumer_def, extra_array_level)) {
+      if (!interstage_match(var, consumer_def, extra_array_level)) {
          linker_error(prog, "definitions of interface block `%s' do not "
                       "match\n", var->get_interface_type()->name);
          return;
@@ -374,19 +338,15 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
               var->data.mode != ir_var_shader_storage))
             continue;
 
-         interface_block_definition *old_def =
-            definitions.lookup(var->get_interface_type()->name);
-         const interface_block_definition new_def(var);
+         ir_variable *old_def = definitions.lookup(var);
          if (old_def == NULL) {
-            definitions.store(new_def);
+            definitions.store(var);
          } else {
             /* Interstage uniform matching rules are the same as intrastage
              * uniform matchin rules (for uniforms, it is as though all
              * shaders are in the same shader stage).
              */
-            if (!intrastage_match(old_def, &new_def,
-                                  (ir_variable_mode) var->data.mode,
-                                  prog)) {
+            if (!intrastage_match(old_def, var, prog)) {
                linker_error(prog, "definitions of interface block `%s' do not "
                             "match\n", var->get_interface_type()->name);
                return;
index 422739a..54fea70 100644 (file)
@@ -24,7 +24,7 @@
 #include "link_uniform_block_active_visitor.h"
 #include "program.h"
 
-link_uniform_block_active *
+static link_uniform_block_active *
 process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 {
    const hash_entry *const existing_block =
@@ -92,7 +92,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
  * and not over complicating the code we will end up with a count of 8.
  * Here each dimension has 2 different indices counted so we end up with 2*2*2
  */
-struct uniform_block_array_elements **
+static struct uniform_block_array_elements **
 process_arrays(void *mem_ctx, ir_dereference_array *ir,
                struct link_uniform_block_active *block)
 {
index d5d30bb..7d75576 100644 (file)
@@ -266,7 +266,7 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
 /* This function resizes the array types of the block so that later we can use
  * this new size to correctly calculate the offest for indirect indexing.
  */
-const glsl_type *
+static const glsl_type *
 resize_block_array(const glsl_type *type,
                    struct uniform_block_array_elements *ub_array)
 {
index 58d21e5..cdc1d3a 100644 (file)
@@ -88,6 +88,7 @@ copy_constant_to_storage(union gl_constant_value *storage,
       case GLSL_TYPE_IMAGE:
       case GLSL_TYPE_ATOMIC_UINT:
       case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_FUNCTION:
       case GLSL_TYPE_VOID:
       case GLSL_TYPE_SUBROUTINE:
       case GLSL_TYPE_ERROR:
index 47bb771..33b2d4c 100644 (file)
@@ -532,6 +532,8 @@ public:
           */
          if (var->is_interface_instance()) {
             ubo_byte_offset = 0;
+            process(var->get_interface_type(),
+                    var->get_interface_type()->name);
          } else {
             const struct gl_uniform_block *const block =
                &prog->BufferInterfaceBlocks[ubo_block_index];
@@ -542,13 +544,8 @@ public:
                &block->Uniforms[var->data.location];
 
             ubo_byte_offset = ubo_var->Offset;
-         }
-
-         if (var->is_interface_instance())
-            process(var->get_interface_type(),
-                    var->get_interface_type()->name);
-         else
             process(var);
+         }
       } else {
          /* Store any explicit location and reset data location so we can
           * reuse this variable for storing the uniform slot number.
index 71750d1..7cc5880 100644 (file)
 
 
 /**
+ * Get the varying type stripped of the outermost array if we're processing
+ * a stage whose varyings are arrays indexed by a vertex number (such as
+ * geometry shader inputs).
+ */
+static const glsl_type *
+get_varying_type(const ir_variable *var, gl_shader_stage stage)
+{
+   const glsl_type *type = var->type;
+
+   if (!var->data.patch &&
+       ((var->data.mode == ir_var_shader_out &&
+         stage == MESA_SHADER_TESS_CTRL) ||
+        (var->data.mode == ir_var_shader_in &&
+         (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL ||
+          stage == MESA_SHADER_GEOMETRY)))) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   return type;
+}
+
+/**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
  */
@@ -309,6 +332,41 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
    }
 }
 
+/**
+ * Demote shader inputs and outputs that are not used in other stages, and
+ * remove them via dead code elimination.
+ */
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode)
+{
+   if (is_separate_shader_object)
+      return;
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *const var = node->as_variable();
+
+      if ((var == NULL) || (var->data.mode != int(mode)))
+        continue;
+
+      /* A shader 'in' or 'out' variable is only really an input or output if
+       * its value is used by other shader stages. This will cause the
+       * variable to have a location assigned.
+       */
+      if (var->data.is_unmatched_generic_inout) {
+         assert(var->data.mode != ir_var_temporary);
+        var->data.mode = ir_var_auto;
+      }
+   }
+
+   /* Eliminate code that is now dead due to unused inputs/outputs being
+    * demoted.
+    */
+   while (do_dead_code(sh->ir, false))
+      ;
+
+}
 
 /**
  * Initialize this object based on a string that was passed to
@@ -432,6 +490,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
+      const unsigned dmul =
+         this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
       unsigned actual_array_size;
       switch (this->lowered_builtin_array_variable) {
       case clip_distance:
@@ -459,7 +519,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
             return false;
          }
          unsigned array_elem_size = this->lowered_builtin_array_variable ?
-            1 : vector_elements * matrix_cols;
+            1 : vector_elements * matrix_cols * dmul;
          fine_location += array_elem_size * this->array_subscript;
          this->size = 1;
       } else {
@@ -519,7 +579,6 @@ tfeedback_decl::get_num_outputs() const
    if (!this->is_varying()) {
       return 0;
    }
-
    return (this->num_components() + this->location_frac + 3)/4;
 }
 
@@ -766,7 +825,8 @@ public:
                    gl_shader_stage consumer_stage);
    ~varying_matches();
    void record(ir_variable *producer_var, ir_variable *consumer_var);
-   unsigned assign_locations(uint64_t reserved_slots, bool separate_shader);
+   unsigned assign_locations(struct gl_shader_program *prog,
+                             uint64_t reserved_slots, bool separate_shader);
    void store_locations() const;
 
 private:
@@ -945,30 +1005,14 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
    if (this->disable_varying_packing) {
-      const struct glsl_type *type = var->type;
       unsigned slots;
+      gl_shader_stage stage =
+         (producer_var != NULL) ? producer_stage : consumer_stage;
 
-      /* Some shader stages have 2-dimensional varyings. Use the inner type. */
-      if (!var->data.patch &&
-          ((var == producer_var && producer_stage == MESA_SHADER_TESS_CTRL) ||
-           (var == consumer_var && (consumer_stage == MESA_SHADER_TESS_CTRL ||
-                                    consumer_stage == MESA_SHADER_TESS_EVAL ||
-                                    consumer_stage == MESA_SHADER_GEOMETRY)))) {
-         assert(type->is_array());
-         type = type->fields.array;
-      }
+      const glsl_type *type = get_varying_type(var, stage);
 
-      if (type->is_array()) {
-         slots = 1;
-         while (type->is_array()) {
-            slots *= type->length;
-            type = type->fields.array;
-         }
-         slots *= type->matrix_columns;
-      } else {
-         slots = type->matrix_columns;
-      }
-      this->matches[this->num_matches].num_components = 4 * slots;
+      slots = type->count_attribute_slots(false);
+      this->matches[this->num_matches].num_components = slots * 4;
    } else {
       this->matches[this->num_matches].num_components
          = var->type->component_slots();
@@ -988,7 +1032,9 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
  * passed to varying_matches::record().
  */
 unsigned
-varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
+varying_matches::assign_locations(struct gl_shader_program *prog,
+                                  uint64_t reserved_slots,
+                                  bool separate_shader)
 {
    /* We disable varying sorting for separate shader programs for the
     * following reasons:
@@ -1025,10 +1071,20 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
    for (unsigned i = 0; i < this->num_matches; i++) {
       unsigned *location = &generic_location;
 
-      if ((this->matches[i].consumer_var &&
-           this->matches[i].consumer_var->data.patch) ||
-          (this->matches[i].producer_var &&
-           this->matches[i].producer_var->data.patch))
+      const ir_variable *var;
+      const glsl_type *type;
+      bool is_vertex_input = false;
+      if (matches[i].consumer_var) {
+         var = matches[i].consumer_var;
+         type = get_varying_type(var, consumer_stage);
+         if (consumer_stage == MESA_SHADER_VERTEX)
+            is_vertex_input = true;
+      } else {
+         var = matches[i].producer_var;
+         type = get_varying_type(var, producer_stage);
+      }
+
+      if (var->data.patch)
          location = &generic_patch_location;
 
       /* Advance to the next slot if this varying has a different packing
@@ -1040,9 +1096,45 @@ varying_matches::assign_locations(uint64_t reserved_slots, bool separate_shader)
           != this->matches[i].packing_class) {
          *location = ALIGN(*location, 4);
       }
-      while ((*location < MAX_VARYING * 4u) &&
-            (reserved_slots & (1u << *location / 4u))) {
-         *location = ALIGN(*location + 1, 4);
+
+      unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
+      unsigned slot_end = this->disable_varying_packing ? 4 :
+         type->without_array()->vector_elements;
+      slot_end += *location - 1;
+
+      /* FIXME: We could be smarter in the code below and loop back over
+       * locations that we skipped because we couldn't pack the varying
+       * around varyings with explicit locations. For now just let the user
+       * hit the linking error if we run out of room, and suggest they use
+       * explicit locations.
+       */
+      for (unsigned j = 0; j < num_elements; j++) {
+         while ((slot_end < MAX_VARYING * 4u) &&
+                ((reserved_slots & (UINT64_C(1) << *location / 4u) ||
+                 (reserved_slots & (UINT64_C(1) << slot_end / 4u))))) {
+
+            *location = ALIGN(*location + 1, 4);
+            slot_end = *location;
+
+            /* reset the counter and try again */
+            j = 0;
+         }
+
+         /* Increase the slot to make sure there is enough room for next
+          * array element.
+          */
+         if (this->disable_varying_packing)
+            slot_end += 4;
+         else
+            slot_end += type->without_array()->vector_elements;
+      }
+
+      if (!var->data.patch && *location >= MAX_VARYING * 4u) {
+         linker_error(prog, "insufficient contiguous locations available for "
+                      "%s it is possible an array or struct could not be "
+                      "packed between varyings with explicit locations. Try "
+                      "using an explicit location for arrays and structs.",
+                      var->name);
       }
 
       this->matches[i].generic_location = *location;
@@ -1203,13 +1295,12 @@ public:
 
    void process(ir_variable *var)
    {
+      /* All named varying interface blocks should be flattened by now */
+      assert(!var->is_interface_instance());
+
       this->toplevel_var = var;
       this->varying_floats = 0;
-      if (var->is_interface_instance())
-         program_resource_visitor::process(var->get_interface_type(),
-                                           var->get_interface_type()->name);
-      else
-         program_resource_visitor::process(var);
+      program_resource_visitor::process(var);
    }
 
 private:
@@ -1426,12 +1517,20 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
    foreach_in_list(ir_instruction, node, stage->ir) {
       ir_variable *const var = node->as_variable();
 
-      if (var == NULL || var->data.mode != io_mode || !var->data.explicit_location)
+      if (var == NULL || var->data.mode != io_mode ||
+          !var->data.explicit_location ||
+          var->data.location < VARYING_SLOT_VAR0)
          continue;
 
       var_slot = var->data.location - VARYING_SLOT_VAR0;
-      if (var_slot >= 0 && var_slot < MAX_VARYING)
-         slots |= 1u << var_slot;
+
+      unsigned num_elements = get_varying_type(var, stage->Stage)
+         ->count_attribute_slots(stage->Stage == MESA_SHADER_VERTEX);
+      for (unsigned i = 0; i < num_elements; i++) {
+         if (var_slot >= 0 && var_slot < MAX_VARYING)
+            slots |= UINT64_C(1) << var_slot;
+         var_slot += 1;
+      }
    }
 
    return slots;
@@ -1617,7 +1716,7 @@ assign_varying_locations(struct gl_context *ctx,
       reserved_varying_slot(producer, ir_var_shader_out) |
       reserved_varying_slot(consumer, ir_var_shader_in);
 
-   const unsigned slots_used = matches.assign_locations(reserved_slots,
+   const unsigned slots_used = matches.assign_locations(prog, reserved_slots,
                                                         prog->SeparateShader);
    matches.store_locations();
 
@@ -1637,17 +1736,6 @@ assign_varying_locations(struct gl_context *ctx,
    hash_table_dtor(consumer_inputs);
    hash_table_dtor(consumer_interface_inputs);
 
-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
-   }
-
    if (consumer && producer) {
       foreach_in_list(ir_instruction, node, consumer->ir) {
          ir_variable *const var = node->as_variable();
@@ -1688,13 +1776,29 @@ assign_varying_locations(struct gl_context *ctx,
                            var->name,
                             _mesa_shader_stage_to_string(producer->Stage));
             }
-
-            /* An 'in' variable is only really a shader input if its
-             * value is written by the previous stage.
-             */
-            var->data.mode = ir_var_auto;
          }
       }
+
+      /* Now that validation is done it's safe to remove unused varyings. As
+       * we have both a producer and consumer it's safe to remove unused
+       * varyings even if the program is an SSO because the stages are being
+       * linked together i.e. we have a multi-stage SSO.
+       */
+      remove_unused_shader_inputs_and_outputs(false, producer,
+                                              ir_var_shader_out);
+      remove_unused_shader_inputs_and_outputs(false, consumer,
+                                              ir_var_shader_in);
+   }
+
+   if (!disable_varying_packing) {
+      if (producer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                               0, producer);
+      }
+      if (consumer) {
+         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                               consumer_vertices, consumer);
+      }
    }
 
    return true;
@@ -1712,7 +1816,8 @@ check_against_output_limit(struct gl_context *ctx,
 
       if (var && var->data.mode == ir_var_shader_out &&
           var_counts_against_varying_limit(producer->Stage, var)) {
-         output_vectors += var->type->count_attribute_slots();
+         /* outputs for fragment shader can't be doubles */
+         output_vectors += var->type->count_attribute_slots(false);
       }
    }
 
@@ -1753,7 +1858,8 @@ check_against_input_limit(struct gl_context *ctx,
 
       if (var && var->data.mode == ir_var_shader_in &&
           var_counts_against_varying_limit(consumer->Stage, var)) {
-         input_vectors += var->type->count_attribute_slots();
+         /* vertex inputs aren't varying counted */
+         input_vectors += var->type->count_attribute_slots(false);
       }
    }
 
index 2ce72d4..b281261 100644 (file)
@@ -131,7 +131,8 @@ public:
       if (this->lowered_builtin_array_variable)
          return this->size;
       else
-         return this->vector_elements * this->matrix_columns * this->size;
+         return this->vector_elements * this->matrix_columns * this->size *
+            (this->is_double() ? 2 : 1);
    }
 
    unsigned get_location() const {
@@ -139,6 +140,29 @@ public:
    }
 
 private:
+
+   bool is_double() const
+   {
+      switch (this->type) {
+      case GL_DOUBLE:
+      case GL_DOUBLE_VEC2:
+      case GL_DOUBLE_VEC3:
+      case GL_DOUBLE_VEC4:
+      case GL_DOUBLE_MAT2:
+      case GL_DOUBLE_MAT2x3:
+      case GL_DOUBLE_MAT2x4:
+      case GL_DOUBLE_MAT3:
+      case GL_DOUBLE_MAT3x2:
+      case GL_DOUBLE_MAT3x4:
+      case GL_DOUBLE_MAT4:
+      case GL_DOUBLE_MAT4x2:
+      case GL_DOUBLE_MAT4x3:
+         return true;
+      default:
+         return false;
+      }
+   }
+
    /**
     * The name that was supplied to glTransformFeedbackVaryings.  Used for
     * error reporting and glGetTransformFeedbackVarying().
@@ -244,6 +268,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                       const void *mem_ctx, unsigned num_names,
                       char **varying_names, tfeedback_decl *decls);
 
+void
+remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
+                                        gl_shader *sh,
+                                        enum ir_variable_mode mode);
+
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
index b39c7f5..564c471 100644 (file)
@@ -1014,7 +1014,7 @@ cross_validate_globals(struct gl_shader_program *prog,
             }
 
             if (var->type->contains_atomic() &&
-                var->data.atomic.offset != existing->data.atomic.offset) {
+                var->data.offset != existing->data.offset) {
                linker_error(prog, "offset specifications for %s "
                             "`%s' have differing values\n",
                             mode_string(var), var->name);
@@ -1133,6 +1133,12 @@ cross_validate_globals(struct gl_shader_program *prog,
                             mode_string(var), var->name);
                return;
             }
+            if (existing->data.image_format != var->data.image_format) {
+               linker_error(prog, "declarations for %s `%s` have "
+                            "mismatching image format qualifiers\n",
+                            mode_string(var), var->name);
+               return;
+            }
         } else
            variables.add_variable(var);
       }
@@ -2466,7 +2472,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
          return false;
       }
 
-      const unsigned slots = var->type->count_attribute_slots();
+      const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX ? true : false);
 
       /* If the variable is not a built-in and has a location statically
        * assigned in the shader (presumably via a layout qualifier), make sure
@@ -2603,17 +2609,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
              * issue (3) of the GL_ARB_vertex_attrib_64bit behavior, this
              * is optional behavior, but it seems preferable.
              */
-            const glsl_type *type = var->type->without_array();
-            if (type == glsl_type::dvec3_type ||
-                type == glsl_type::dvec4_type ||
-                type == glsl_type::dmat2x3_type ||
-                type == glsl_type::dmat2x4_type ||
-                type == glsl_type::dmat3_type ||
-                type == glsl_type::dmat3x4_type ||
-                type == glsl_type::dmat4x3_type ||
-                type == glsl_type::dmat4_type) {
+            if (var->type->without_array()->is_dual_slot_double())
                double_storage_locations |= (use_mask << attr);
-            }
         }
 
         continue;
@@ -2732,30 +2729,6 @@ match_explicit_outputs_to_inputs(struct gl_shader_program *prog,
 }
 
 /**
- * Demote shader inputs and outputs that are not used in other stages
- */
-void
-demote_shader_inputs_and_outputs(gl_shader *sh, enum ir_variable_mode mode)
-{
-   foreach_in_list(ir_instruction, node, sh->ir) {
-      ir_variable *const var = node->as_variable();
-
-      if ((var == NULL) || (var->data.mode != int(mode)))
-        continue;
-
-      /* A shader 'in' or 'out' variable is only really an input or output if
-       * its value is used by other shader stages.  This will cause the variable
-       * to have a location assigned.
-       */
-      if (var->data.is_unmatched_generic_inout) {
-         assert(var->data.mode != ir_var_temporary);
-        var->data.mode = ir_var_auto;
-      }
-   }
-}
-
-
-/**
  * Store the gl_FragDepth layout in the gl_shader_program struct.
  */
 static void
@@ -3004,7 +2977,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
             foreach_in_list(ir_instruction, node, sh->ir) {
                ir_variable *var = node->as_variable();
                if (var && var->data.mode == ir_var_shader_out)
-                  fragment_outputs += var->type->count_attribute_slots();
+                  /* since there are no double fs outputs - pass false */
+                  fragment_outputs += var->type->count_attribute_slots(false);
             }
          }
       }
@@ -3162,6 +3136,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
       return;
    }
 
+   unsigned entries_total = 0;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_shader *sh = prog->_LinkedShaders[i];
 
@@ -3170,8 +3145,12 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if (var && (var->data.mode == ir_var_uniform &&
-                     var->data.explicit_location)) {
+         if (!var || var->data.mode != ir_var_uniform)
+            continue;
+
+         entries_total += var->type->uniform_locations();
+
+         if (var->data.explicit_location) {
             bool ret;
             if (var->type->is_subroutine())
                ret = reserve_subroutine_explicit_locations(prog, sh, var);
@@ -3185,6 +3164,14 @@ check_explicit_uniform_locations(struct gl_context *ctx,
       }
    }
 
+   /* Verify that total amount of entries for explicit and implicit locations
+    * is less than MAX_UNIFORM_LOCATIONS.
+    */
+   if (entries_total >= ctx->Const.MaxUserAssignableUniformLocations) {
+      linker_error(prog, "count of uniform locations >= MAX_UNIFORM_LOCATIONS"
+                   "(%u >= %u)", entries_total,
+                   ctx->Const.MaxUserAssignableUniformLocations);
+   }
    delete uniform_map;
 }
 
@@ -3381,6 +3368,30 @@ build_stageref(struct gl_shader_program *shProg, const char *name,
    return stages;
 }
 
+/**
+ * Create gl_shader_variable from ir_variable class.
+ */
+static gl_shader_variable *
+create_shader_variable(struct gl_shader_program *shProg, const ir_variable *in)
+{
+   gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable);
+   if (!out)
+      return NULL;
+
+   out->type = in->type;
+   out->name = ralloc_strdup(shProg, in->name);
+
+   if (!out->name)
+      return NULL;
+
+   out->location = in->data.location;
+   out->index = in->data.index;
+   out->patch = in->data.patch;
+   out->mode = in->data.mode;
+
+   return out;
+}
+
 static bool
 add_interface_variables(struct gl_shader_program *shProg,
                         exec_list *ir, GLenum programInterface)
@@ -3432,16 +3443,20 @@ add_interface_variables(struct gl_shader_program *shProg,
       if (strncmp(var->name, "gl_out_FragData", 15) == 0)
          continue;
 
-      if (!add_program_resource(shProg, programInterface, var,
-                                build_stageref(shProg, var->name,
-                                               var->data.mode) | mask))
+      gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+      if (!sha_v)
+         return false;
+
+      if (!add_program_resource(shProg, programInterface, sha_v,
+                                build_stageref(shProg, sha_v->name,
+                                               sha_v->mode) | mask))
          return false;
    }
    return true;
 }
 
 static bool
-add_packed_varyings(struct gl_shader_program *shProg, int stage)
+add_packed_varyings(struct gl_shader_program *shProg, int stage, GLenum type)
 {
    struct gl_shader *sh = shProg->_LinkedShaders[stage];
    GLenum iface;
@@ -3462,10 +3477,16 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage)
          default:
             unreachable("unexpected type");
          }
-         if (!add_program_resource(shProg, iface, var,
-                                   build_stageref(shProg, var->name,
-                                                  var->data.mode)))
-            return false;
+
+         if (type == iface) {
+            gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+            if (!sha_v)
+               return false;
+            if (!add_program_resource(shProg, iface, sha_v,
+                                      build_stageref(shProg, sha_v->name,
+                                                     sha_v->mode)))
+               return false;
+         }
       }
    }
    return true;
@@ -3483,7 +3504,10 @@ add_fragdata_arrays(struct gl_shader_program *shProg)
       ir_variable *var = node->as_variable();
       if (var) {
          assert(var->data.mode == ir_var_shader_out);
-         if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var,
+         gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+         if (!sha_v)
+            return false;
+         if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v,
                                    1 << MESA_SHADER_FRAGMENT))
             return false;
       }
@@ -3732,9 +3756,10 @@ build_program_resource_list(struct gl_shader_program *shProg)
 
    /* Program interface needs to expose varyings in case of SSO. */
    if (shProg->SeparateShader) {
-      if (!add_packed_varyings(shProg, input_stage))
+      if (!add_packed_varyings(shProg, input_stage, GL_PROGRAM_INPUT))
          return;
-      if (!add_packed_varyings(shProg, output_stage))
+
+      if (!add_packed_varyings(shProg, output_stage, GL_PROGRAM_OUTPUT))
          return;
    }
 
@@ -3950,8 +3975,10 @@ split_ubos_and_ssbos(void *mem_ctx,
                      unsigned num_blocks,
                      struct gl_uniform_block ***ubos,
                      unsigned *num_ubos,
+                     unsigned **ubo_interface_block_indices,
                      struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos)
+                     unsigned *num_ssbos,
+                     unsigned **ssbo_interface_block_indices)
 {
    unsigned num_ubo_blocks = 0;
    unsigned num_ssbo_blocks = 0;
@@ -3969,11 +3996,25 @@ split_ubos_and_ssbos(void *mem_ctx,
    *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
    *num_ssbos = 0;
 
+   if (ubo_interface_block_indices)
+      *ubo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
+
+   if (ssbo_interface_block_indices)
+      *ssbo_interface_block_indices =
+         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
+
    for (unsigned i = 0; i < num_blocks; i++) {
       if (blocks[i].IsShaderStorage) {
-         (*ssbos)[(*num_ssbos)++] = &blocks[i];
+         (*ssbos)[*num_ssbos] = &blocks[i];
+         if (ssbo_interface_block_indices)
+            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+         (*num_ssbos)++;
       } else {
-         (*ubos)[(*num_ubos)++] = &blocks[i];
+         (*ubos)[*num_ubos] = &blocks[i];
+         if (ubo_interface_block_indices)
+            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*num_ubos)++;
       }
    }
 
@@ -4451,14 +4492,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       do_dead_builtin_varyings(ctx, sh, NULL,
                                num_tfeedback_decls, tfeedback_decls);
 
-      if (!prog->SeparateShader) {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_out);
-         /* Eliminate code that is now dead due to unused outputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
-      }
+      remove_unused_shader_inputs_and_outputs(prog->SeparateShader, sh,
+                                              ir_var_shader_out);
    }
    else if (first == MESA_SHADER_FRAGMENT) {
       /* If the program only contains a fragment shader...
@@ -4476,12 +4511,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                        NULL /* tfeedback_decls */))
             goto done;
       } else {
-         demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
-         /* Eliminate code that is now dead due to unused inputs being
-          * demoted.
-          */
-         while (do_dead_code(sh->ir, false))
-            ;
+         remove_unused_shader_inputs_and_outputs(false, sh,
+                                                 ir_var_shader_in);
       }
    }
 
@@ -4502,16 +4533,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                 next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
                 tfeedback_decls);
 
-      demote_shader_inputs_and_outputs(sh_i, ir_var_shader_out);
-      demote_shader_inputs_and_outputs(sh_next, ir_var_shader_in);
-
-      /* Eliminate code that is now dead due to unused outputs being demoted.
-       */
-      while (do_dead_code(sh_i->ir, false))
-         ;
-      while (do_dead_code(sh_next->ir, false))
-         ;
-
       /* This must be done after all dead varyings are eliminated. */
       if (!check_against_output_limit(ctx, prog, sh_i))
          goto done;
@@ -4585,8 +4606,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                               sh->NumBufferInterfaceBlocks,
                               &sh->UniformBlocks,
                               &sh->NumUniformBlocks,
+                              NULL,
                               &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks);
+                              &sh->NumShaderStorageBlocks,
+                              NULL);
       }
    }
 
@@ -4595,8 +4618,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                         prog->NumBufferInterfaceBlocks,
                         &prog->UniformBlocks,
                         &prog->NumUniformBlocks,
+                        &prog->UboInterfaceBlockIndex,
                         &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks);
+                        &prog->NumShaderStorageBlocks,
+                        &prog->SsboInterfaceBlockIndex);
 
    /* FINISHME: Assign fragment shader output locations. */
 
index 845cfff..d140be3 100644 (file)
@@ -39,7 +39,6 @@
  * - MOD_TO_FLOOR
  * - LDEXP_TO_ARITH
  * - DFREXP_TO_ARITH
- * - BITFIELD_INSERT_TO_BFM_BFI
  * - CARRY_TO_ARITH
  * - BORROW_TO_ARITH
  * - SAT_TO_CLAMP
  * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
  * arithmetic and bit ops for double arguments.
  *
- * BITFIELD_INSERT_TO_BFM_BFI:
- * ---------------------------
- * Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
- * ir_triop_bfi (bitfield insert).
- *
- * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
- * with a pair of instructions.
- *
  * CARRY_TO_ARITH:
  * ---------------
  * Converts ir_carry into (x + y) < x.
@@ -154,7 +145,6 @@ private:
    void exp_to_exp2(ir_expression *);
    void pow_to_exp2(ir_expression *);
    void log_to_log2(ir_expression *);
-   void bitfield_insert_to_bfm_bfi(ir_expression *);
    void ldexp_to_arith(ir_expression *);
    void dldexp_to_arith(ir_expression *);
    void dfrexp_sig_to_arith(ir_expression *);
@@ -348,29 +338,6 @@ lower_instructions_visitor::mod_to_floor(ir_expression *ir)
 }
 
 void
-lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression *ir)
-{
-   /* Translates
-    *    ir_quadop_bitfield_insert base insert offset bits
-    * into
-    *    ir_triop_bfi (ir_binop_bfm bits offset) insert base
-    */
-
-   ir_rvalue *base_expr = ir->operands[0];
-
-   ir->operation = ir_triop_bfi;
-   ir->operands[0] = new(ir) ir_expression(ir_binop_bfm,
-                                           ir->type->get_base_type(),
-                                           ir->operands[3],
-                                           ir->operands[2]);
-   /* ir->operands[1] is still the value to insert. */
-   ir->operands[2] = base_expr;
-   ir->operands[3] = NULL;
-
-   this->progress = true;
-}
-
-void
 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
 {
    /* Translates
@@ -414,8 +381,8 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
 
    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
 
-   ir_constant *exp_shift = new(ir) ir_constant(23);
-   ir_constant *exp_width = new(ir) ir_constant(8);
+   ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
+   ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
 
    /* Temporary variables */
    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
@@ -482,12 +449,6 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
                                      exp_shift_clone, exp_width);
    ir->operands[1] = NULL;
 
-   /* Don't generate new IR that would need to be lowered in an additional
-    * pass.
-    */
-   if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
-      bitfield_insert_to_bfm_bfi(ir->operands[0]->as_expression());
-
    this->progress = true;
 }
 
@@ -509,8 +470,8 @@ lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
 
    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
 
-   ir_constant *exp_shift = new(ir) ir_constant(20);
-   ir_constant *exp_width = new(ir) ir_constant(11);
+   ir_constant *exp_shift = new(ir) ir_constant(20, vec_elem);
+   ir_constant *exp_width = new(ir) ir_constant(11, vec_elem);
    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
 
    /* Temporary variables */
@@ -602,9 +563,6 @@ lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
             exp_shift->clone(ir, NULL),
             exp_width->clone(ir, NULL));
 
-      if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
-         bitfield_insert_to_bfm_bfi(bfi);
-
       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
 
       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
@@ -1039,11 +997,6 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
         pow_to_exp2(ir);
       break;
 
-   case ir_quadop_bitfield_insert:
-      if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
-         bitfield_insert_to_bfm_bfi(ir);
-      break;
-
    case ir_binop_ldexp:
       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
          ldexp_to_arith(ir);
index dda754f..e96cda2 100644 (file)
@@ -277,7 +277,11 @@ ir_mat_op_to_vec_visitor::do_equal_mat_mat(ir_dereference *result,
    }
 
    ir_rvalue *const val = new(this->mem_ctx) ir_dereference_variable(tmp_bvec);
-   ir_expression *any = new(this->mem_ctx) ir_expression(ir_unop_any, val);
+   uint8_t vec_elems = val->type->vector_elements;
+   ir_expression *any =
+      new(this->mem_ctx) ir_expression(ir_binop_any_nequal, val,
+                                       new(this->mem_ctx) ir_constant(false,
+                                                                      vec_elems));
 
    if (test_equal)
       any = new(this->mem_ctx) ir_expression(ir_unop_logic_not, any);
index c8bf68b..7f18238 100644 (file)
@@ -230,8 +230,8 @@ private:
       if (op_mask & LOWER_PACK_USE_BFI) {
          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
                                 swizzle_y(u),
-                                constant(16),
-                                constant(16));
+                                constant(16u),
+                                constant(16u));
       }
 
       /* return (u.y << 16) | (u.x & 0xffff); */
@@ -261,9 +261,9 @@ private:
          return bitfield_insert(bitfield_insert(
                                    bitfield_insert(
                                       bit_and(swizzle_x(u), constant(0xffu)),
-                                      swizzle_y(u), constant(8), constant(8)),
-                                   swizzle_z(u), constant(16), constant(8)),
-                                swizzle_w(u), constant(24), constant(8));
+                                      swizzle_y(u), constant(8u), constant(8u)),
+                                   swizzle_z(u), constant(16u), constant(8u)),
+                                swizzle_w(u), constant(24u), constant(8u));
       }
 
       /* uvec4 u = UVEC4_RVAL & 0xff */
@@ -365,11 +365,11 @@ private:
 
       if (op_mask & LOWER_PACK_USE_BFE) {
          /* u4.y = bitfield_extract(u, 8, 8); */
-         factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)),
+         factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
                              WRITEMASK_Y));
 
          /* u4.z = bitfield_extract(u, 16, 8); */
-         factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)),
+         factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
                              WRITEMASK_Z));
       } else {
          /* u4.y = (u >> 8u) & 0xffu; */
index db8b0ca..5a1bbc4 100644 (file)
@@ -46,7 +46,7 @@ namespace {
 class nir_visitor : public ir_visitor
 {
 public:
-   nir_visitor(nir_shader *shader);
+   nir_visitor(nir_shader *shader, gl_shader *sh);
    ~nir_visitor();
 
    virtual void visit(ir_variable *);
@@ -70,10 +70,9 @@ public:
    virtual void visit(ir_dereference_array *);
    virtual void visit(ir_barrier *);
 
-   void create_function(ir_function *ir);
+   void create_function(ir_function_signature *ir);
 
 private:
-   void create_overload(ir_function_signature *ir, nir_function *function);
    void add_instr(nir_instr *instr, unsigned num_components);
    nir_ssa_def *evaluate_rvalue(ir_rvalue *ir);
 
@@ -86,6 +85,8 @@ private:
 
    bool supports_ints;
 
+   struct gl_shader *sh;
+
    nir_shader *shader;
    nir_function_impl *impl;
    nir_builder b;
@@ -139,12 +140,21 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
 
    nir_shader *shader = nir_shader_create(NULL, stage, options);
 
-   nir_visitor v1(shader);
+   nir_visitor v1(shader, sh);
    nir_function_visitor v2(&v1);
    v2.run(sh->ir);
    visit_exec_list(sh->ir, &v1);
 
-   nir_lower_outputs_to_temporaries(shader);
+   nir_function *main = NULL;
+   nir_foreach_function(shader, func) {
+      if (strcmp(func->name, "main") == 0) {
+         main = func;
+         break;
+      }
+   }
+   assert(main);
+
+   nir_lower_outputs_to_temporaries(shader, main);
 
    shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
    if (shader_prog->Label)
@@ -205,10 +215,11 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    return shader;
 }
 
-nir_visitor::nir_visitor(nir_shader *shader)
+nir_visitor::nir_visitor(nir_shader *shader, gl_shader *sh)
 {
    this->supports_ints = shader->options->native_integers;
    this->shader = shader;
+   this->sh = sh;
    this->is_global = true;
    this->var_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                              _mesa_key_pointer_equal);
@@ -364,7 +375,6 @@ nir_visitor::visit(ir_variable *ir)
    var->data.explicit_index = ir->data.explicit_index;
    var->data.explicit_binding = ir->data.explicit_binding;
    var->data.has_initializer = ir->data.has_initializer;
-   var->data.is_unmatched_generic_inout = ir->data.is_unmatched_generic_inout;
    var->data.location_frac = ir->data.location_frac;
    var->data.from_named_ifc_block_array = ir->data.from_named_ifc_block_array;
    var->data.from_named_ifc_block_nonarray = ir->data.from_named_ifc_block_nonarray;
@@ -390,8 +400,9 @@ nir_visitor::visit(ir_variable *ir)
    }
 
    var->data.index = ir->data.index;
+   var->data.descriptor_set = 0;
    var->data.binding = ir->data.binding;
-   var->data.atomic.offset = ir->data.atomic.offset;
+   var->data.offset = ir->data.offset;
    var->data.image.read_only = ir->data.image_read_only;
    var->data.image.write_only = ir->data.image_write_only;
    var->data.image.coherent = ir->data.image_coherent;
@@ -431,60 +442,50 @@ nir_visitor::visit(ir_variable *ir)
 ir_visitor_status
 nir_function_visitor::visit_enter(ir_function *ir)
 {
-   visitor->create_function(ir);
-   return visit_continue_with_parent;
-}
-
-
-void
-nir_visitor::create_function(ir_function *ir)
-{
-   nir_function *func = nir_function_create(this->shader, ir->name);
    foreach_in_list(ir_function_signature, sig, &ir->signatures) {
-      create_overload(sig, func);
+      visitor->create_function(sig);
    }
+   return visit_continue_with_parent;
 }
 
-
-
 void
-nir_visitor::create_overload(ir_function_signature *ir, nir_function *function)
+nir_visitor::create_function(ir_function_signature *ir)
 {
    if (ir->is_intrinsic)
       return;
 
-   nir_function_overload *overload = nir_function_overload_create(function);
+   nir_function *func = nir_function_create(shader, ir->function_name());
 
    unsigned num_params = ir->parameters.length();
-   overload->num_params = num_params;
-   overload->params = ralloc_array(shader, nir_parameter, num_params);
+   func->num_params = num_params;
+   func->params = ralloc_array(shader, nir_parameter, num_params);
 
    unsigned i = 0;
    foreach_in_list(ir_variable, param, &ir->parameters) {
       switch (param->data.mode) {
       case ir_var_function_in:
-         overload->params[i].param_type = nir_parameter_in;
+         func->params[i].param_type = nir_parameter_in;
          break;
 
       case ir_var_function_out:
-         overload->params[i].param_type = nir_parameter_out;
+         func->params[i].param_type = nir_parameter_out;
          break;
 
       case ir_var_function_inout:
-         overload->params[i].param_type = nir_parameter_inout;
+         func->params[i].param_type = nir_parameter_inout;
          break;
 
       default:
          unreachable("not reached");
       }
 
-      overload->params[i].type = param->type;
+      func->params[i].type = param->type;
       i++;
    }
 
-   overload->return_type = ir->return_type;
+   func->return_type = ir->return_type;
 
-   _mesa_hash_table_insert(this->overload_table, ir, overload);
+   _mesa_hash_table_insert(this->overload_table, ir, func);
 }
 
 void
@@ -504,13 +505,13 @@ nir_visitor::visit(ir_function_signature *ir)
       _mesa_hash_table_search(this->overload_table, ir);
 
    assert(entry);
-   nir_function_overload *overload = (nir_function_overload *) entry->data;
+   nir_function *func = (nir_function *) entry->data;
 
    if (ir->is_defined) {
-      nir_function_impl *impl = nir_function_impl_create(overload);
+      nir_function_impl *impl = nir_function_impl_create(func);
       this->impl = impl;
 
-      unsigned num_params = overload->num_params;
+      unsigned num_params = func->num_params;
       impl->num_params = num_params;
       impl->params = ralloc_array(this->shader, nir_variable *, num_params);
       unsigned i = 0;
@@ -520,13 +521,13 @@ nir_visitor::visit(ir_function_signature *ir)
          i++;
       }
 
-      if (overload->return_type == glsl_type::void_type) {
+      if (func->return_type == glsl_type::void_type) {
          impl->return_var = NULL;
       } else {
          impl->return_var = ralloc(this->shader, nir_variable);
          impl->return_var->name = ralloc_strdup(impl->return_var,
                                                 "return_var");
-         impl->return_var->type = overload->return_type;
+         impl->return_var->type = func->return_type;
       }
 
       this->is_global = false;
@@ -537,7 +538,7 @@ nir_visitor::visit(ir_function_signature *ir)
 
       this->is_global = true;
    } else {
-      overload->impl = NULL;
+      func->impl = NULL;
    }
 }
 
@@ -1068,6 +1069,7 @@ nir_visitor::visit(ir_call *ir)
          nir_intrinsic_instr *store_instr =
             nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
          store_instr->num_components = ir->return_deref->type->vector_elements;
+         store_instr->const_index[0] = (1 << store_instr->num_components) - 1;
 
          store_instr->variables[0] =
             evaluate_deref(&store_instr->instr, ir->return_deref);
@@ -1082,7 +1084,7 @@ nir_visitor::visit(ir_call *ir)
    struct hash_entry *entry =
       _mesa_hash_table_search(this->overload_table, ir->callee);
    assert(entry);
-   nir_function_overload *callee = (nir_function_overload *) entry->data;
+   nir_function *callee = (nir_function *) entry->data;
 
    nir_call_instr *instr = nir_call_instr_create(this->shader, callee);
 
@@ -1129,43 +1131,23 @@ nir_visitor::visit(ir_assignment *ir)
    nir_ssa_def *src = evaluate_rvalue(ir->rhs);
 
    if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) {
-      /*
-       * We have no good way to update only part of a variable, so just load
-       * the LHS and do a vec operation to combine the old with the new, and
-       * then store it
-       * back into the LHS. Copy propagation should get rid of the mess.
+      /* GLSL IR will give us the input to the write-masked assignment in a
+       * single packed vector.  So, for example, if the writemask is xzw, then
+       * we have to swizzle x -> x, y -> z, and z -> w and get the y component
+       * from the load.
        */
-
-      nir_intrinsic_instr *load =
-         nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_var);
-      load->num_components = ir->lhs->type->vector_elements;
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
-      load->variables[0] = lhs_deref;
-      ralloc_steal(load, load->variables[0]);
-      nir_builder_instr_insert(&b, &load->instr);
-
-      nir_ssa_def *srcs[4];
-
+      unsigned swiz[4];
       unsigned component = 0;
-      for (unsigned i = 0; i < ir->lhs->type->vector_elements; i++) {
-         if (ir->write_mask & (1 << i)) {
-            /* GLSL IR will give us the input to the write-masked assignment
-             * in a single packed vector.  So, for example, if the
-             * writemask is xzw, then we have to swizzle x -> x, y -> z,
-             * and z -> w and get the y component from the load.
-             */
-            srcs[i] = nir_channel(&b, src, component++);
-         } else {
-            srcs[i] = nir_channel(&b, &load->dest.ssa, i);
-         }
+      for (unsigned i = 0; i < 4; i++) {
+         swiz[i] = ir->write_mask & (1 << i) ? component++ : 0;
       }
-
-      src = nir_vec(&b, srcs, ir->lhs->type->vector_elements);
+      src = nir_swizzle(&b, src, swiz, num_components, !supports_ints);
    }
 
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_store_var);
    store->num_components = ir->lhs->type->vector_elements;
+   store->const_index[0] = ir->write_mask;
    nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref);
    store->variables[0] = nir_deref_as_var(store_deref);
    store->src[0] = nir_src_for_ssa(src);
@@ -1417,24 +1399,6 @@ nir_visitor::visit(ir_expression *ir)
       /* no-op */
       result = nir_imov(&b, srcs[0]);
       break;
-   case ir_unop_any:
-      switch (ir->operands[0]->type->vector_elements) {
-      case 2:
-         result = supports_ints ? nir_bany2(&b, srcs[0])
-                                : nir_fany2(&b, srcs[0]);
-         break;
-      case 3:
-         result = supports_ints ? nir_bany3(&b, srcs[0])
-                                : nir_fany3(&b, srcs[0]);
-         break;
-      case 4:
-         result = supports_ints ? nir_bany4(&b, srcs[0])
-                                : nir_fany4(&b, srcs[0]);
-         break;
-      default:
-         unreachable("not reached");
-      }
-      break;
    case ir_unop_trunc: result = nir_ftrunc(&b, srcs[0]); break;
    case ir_unop_ceil:  result = nir_fceil(&b, srcs[0]); break;
    case ir_unop_floor: result = nir_ffloor(&b, srcs[0]); break;
@@ -1770,7 +1734,6 @@ nir_visitor::visit(ir_expression *ir)
    case ir_binop_pack_half_2x16_split:
          result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]);
          break;
-   case ir_binop_bfm:   result = nir_bfm(&b, srcs[0], srcs[1]);   break;
    case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break;
    case ir_triop_fma:
       result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]);
@@ -1784,9 +1747,6 @@ nir_visitor::visit(ir_expression *ir)
       else
          result = nir_fcsel(&b, srcs[0], srcs[1], srcs[2]);
       break;
-   case ir_triop_bfi:
-      result = nir_bfi(&b, srcs[0], srcs[1], srcs[2]);
-      break;
    case ir_triop_bitfield_extract:
       result = (out_type == GLSL_TYPE_INT) ?
          nir_ibitfield_extract(&b, srcs[0], srcs[1], srcs[2]) :
index 3cf2f03..71095fa 100644 (file)
@@ -32,6 +32,7 @@ mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
 hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
 hash_table *glsl_type::interface_types = NULL;
+hash_table *glsl_type::function_types = NULL;
 hash_table *glsl_type::subroutine_types = NULL;
 void *glsl_type::mem_ctx = NULL;
 
@@ -169,6 +170,39 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
    mtx_unlock(&glsl_type::mutex);
 }
 
+glsl_type::glsl_type(const glsl_type *return_type,
+                     const glsl_function_param *params, unsigned num_params) :
+   gl_type(0),
+   base_type(GLSL_TYPE_FUNCTION),
+   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
+   sampler_type(0), interface_packing(0),
+   vector_elements(0), matrix_columns(0),
+   length(num_params)
+{
+   unsigned int i;
+
+   mtx_lock(&glsl_type::mutex);
+
+   init_ralloc_type_ctx();
+
+   this->fields.parameters = rzalloc_array(this->mem_ctx,
+                                           glsl_function_param, num_params + 1);
+
+   /* We store the return type as the first parameter */
+   this->fields.parameters[0].type = return_type;
+   this->fields.parameters[0].in = false;
+   this->fields.parameters[0].out = true;
+
+   /* We store the i'th parameter in slot i+1 */
+   for (i = 0; i < length; i++) {
+      this->fields.parameters[i + 1].type = params[i].type;
+      this->fields.parameters[i + 1].in = params[i].in;
+      this->fields.parameters[i + 1].out = params[i].out;
+   }
+
+   mtx_unlock(&glsl_type::mutex);
+}
+
 glsl_type::glsl_type(const char *subroutine_name) :
    gl_type(0),
    base_type(GLSL_TYPE_SUBROUTINE),
@@ -681,6 +715,93 @@ glsl_type::get_sampler_instance(enum glsl_sampler_dim dim,
 }
 
 const glsl_type *
+glsl_type::get_image_instance(enum glsl_sampler_dim dim,
+                              bool array, glsl_base_type type)
+{
+   switch (type) {
+   case GLSL_TYPE_FLOAT:
+      switch (dim) {
+      case GLSL_SAMPLER_DIM_1D:
+         return (array ? image1DArray_type : image1D_type);
+      case GLSL_SAMPLER_DIM_2D:
+         return (array ? image2DArray_type : image2D_type);
+      case GLSL_SAMPLER_DIM_3D:
+         return image3D_type;
+      case GLSL_SAMPLER_DIM_CUBE:
+         return (array ? imageCubeArray_type : imageCube_type);
+      case GLSL_SAMPLER_DIM_RECT:
+         if (array)
+            return error_type;
+         else
+            return image2DRect_type;
+      case GLSL_SAMPLER_DIM_BUF:
+         if (array)
+            return error_type;
+         else
+            return imageBuffer_type;
+      case GLSL_SAMPLER_DIM_MS:
+         return (array ? image2DMSArray_type : image2DMS_type);
+      case GLSL_SAMPLER_DIM_EXTERNAL:
+         return error_type;
+      }
+   case GLSL_TYPE_INT:
+      switch (dim) {
+      case GLSL_SAMPLER_DIM_1D:
+         return (array ? iimage1DArray_type : iimage1D_type);
+      case GLSL_SAMPLER_DIM_2D:
+         return (array ? iimage2DArray_type : iimage2D_type);
+      case GLSL_SAMPLER_DIM_3D:
+         if (array)
+            return error_type;
+         return iimage3D_type;
+      case GLSL_SAMPLER_DIM_CUBE:
+         return (array ? iimageCubeArray_type : iimageCube_type);
+      case GLSL_SAMPLER_DIM_RECT:
+         if (array)
+            return error_type;
+         return iimage2DRect_type;
+      case GLSL_SAMPLER_DIM_BUF:
+         if (array)
+            return error_type;
+         return iimageBuffer_type;
+      case GLSL_SAMPLER_DIM_MS:
+         return (array ? iimage2DMSArray_type : iimage2DMS_type);
+      case GLSL_SAMPLER_DIM_EXTERNAL:
+         return error_type;
+      }
+   case GLSL_TYPE_UINT:
+      switch (dim) {
+      case GLSL_SAMPLER_DIM_1D:
+         return (array ? uimage1DArray_type : uimage1D_type);
+      case GLSL_SAMPLER_DIM_2D:
+         return (array ? uimage2DArray_type : uimage2D_type);
+      case GLSL_SAMPLER_DIM_3D:
+         if (array)
+            return error_type;
+         return uimage3D_type;
+      case GLSL_SAMPLER_DIM_CUBE:
+         return (array ? uimageCubeArray_type : uimageCube_type);
+      case GLSL_SAMPLER_DIM_RECT:
+         if (array)
+            return error_type;
+         return uimage2DRect_type;
+      case GLSL_SAMPLER_DIM_BUF:
+         if (array)
+            return error_type;
+         return uimageBuffer_type;
+      case GLSL_SAMPLER_DIM_MS:
+         return (array ? uimage2DMSArray_type : uimage2DMS_type);
+      case GLSL_SAMPLER_DIM_EXTERNAL:
+         return error_type;
+      }
+   default:
+      return error_type;
+   }
+
+   unreachable("switch statement above should be complete");
+}
+
+const glsl_type *
 glsl_type::get_array_instance(const glsl_type *base, unsigned array_size)
 {
    /* Generate a name using the base type pointer in the key.  This is
@@ -924,6 +1045,74 @@ glsl_type::get_subroutine_instance(const char *subroutine_name)
 }
 
 
+static bool
+function_key_compare(const void *a, const void *b)
+{
+   const glsl_type *const key1 = (glsl_type *) a;
+   const glsl_type *const key2 = (glsl_type *) b;
+
+   if (key1->length != key2->length)
+      return 1;
+
+   return memcmp(key1->fields.parameters, key2->fields.parameters,
+                 (key1->length + 1) * sizeof(*key1->fields.parameters)) == 0;
+}
+
+
+static uint32_t
+function_key_hash(const void *a)
+{
+   const glsl_type *const key = (glsl_type *) a;
+   char hash_key[128];
+   unsigned size = 0;
+
+   size = snprintf(hash_key, sizeof(hash_key), "%08x", key->length);
+
+   for (unsigned i = 0; i < key->length; i++) {
+      if (size >= sizeof(hash_key))
+        break;
+
+      size += snprintf(& hash_key[size], sizeof(hash_key) - size,
+                      "%p", (void *) key->fields.structure[i].type);
+   }
+
+   return _mesa_hash_string(hash_key);
+}
+
+const glsl_type *
+glsl_type::get_function_instance(const glsl_type *return_type,
+                                 const glsl_function_param *params,
+                                 unsigned num_params)
+{
+   const glsl_type key(return_type, params, num_params);
+
+   mtx_lock(&glsl_type::mutex);
+
+   if (function_types == NULL) {
+      function_types = _mesa_hash_table_create(NULL, function_key_hash,
+                                               function_key_compare);
+   }
+
+   struct hash_entry *entry = _mesa_hash_table_search(function_types, &key);
+   if (entry == NULL) {
+      mtx_unlock(&glsl_type::mutex);
+      const glsl_type *t = new glsl_type(return_type, params, num_params);
+      mtx_lock(&glsl_type::mutex);
+
+      entry = _mesa_hash_table_insert(function_types, t, (void *) t);
+   }
+
+   const glsl_type *t = (const glsl_type *)entry->data;
+
+   assert(t->base_type == GLSL_TYPE_FUNCTION);
+   assert(t->length == num_params);
+
+   mtx_unlock(&glsl_type::mutex);
+
+   return t;
+}
+
+
 const glsl_type *
 glsl_type::get_mul_type(const glsl_type *type_a, const glsl_type *type_b)
 {
@@ -1053,6 +1242,8 @@ glsl_type::component_slots() const
       return 1;
    case GLSL_TYPE_SUBROUTINE:
      return 1;
+
+   case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
@@ -1640,7 +1831,7 @@ glsl_type::std430_size(bool row_major) const
 }
 
 unsigned
-glsl_type::count_attribute_slots() const
+glsl_type::count_attribute_slots(bool vertex_input_slots) const
 {
    /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec:
     *
@@ -1661,28 +1852,37 @@ glsl_type::count_attribute_slots() const
     * allows varying structs, the number of varying slots taken up by a
     * varying struct is simply equal to the sum of the number of slots taken
     * up by each element.
+    *
+    * Doubles are counted different depending on whether they are vertex
+    * inputs or everything else. Vertex inputs from ARB_vertex_attrib_64bit
+    * take one location no matter what size they are, otherwise dvec3/4
+    * take two locations.
     */
    switch (this->base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_BOOL:
-   case GLSL_TYPE_DOUBLE:
       return this->matrix_columns;
-
+   case GLSL_TYPE_DOUBLE:
+      if (this->vector_elements > 2 && !vertex_input_slots)
+         return this->matrix_columns * 2;
+      else
+         return this->matrix_columns;
    case GLSL_TYPE_STRUCT:
    case GLSL_TYPE_INTERFACE: {
       unsigned size = 0;
 
       for (unsigned i = 0; i < this->length; i++)
-         size += this->fields.structure[i].type->count_attribute_slots();
+         size += this->fields.structure[i].type->count_attribute_slots(vertex_input_slots);
 
       return size;
    }
 
    case GLSL_TYPE_ARRAY:
-      return this->length * this->fields.array->count_attribute_slots();
+      return this->length * this->fields.array->count_attribute_slots(vertex_input_slots);
 
+   case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
index d8a999a..ff8dcc7 100644 (file)
@@ -56,6 +56,7 @@ enum glsl_base_type {
    GLSL_TYPE_IMAGE,
    GLSL_TYPE_ATOMIC_UINT,
    GLSL_TYPE_STRUCT,
+   GLSL_TYPE_FUNCTION,
    GLSL_TYPE_INTERFACE,
    GLSL_TYPE_ARRAY,
    GLSL_TYPE_VOID,
@@ -187,7 +188,7 @@ struct glsl_type {
     */
    union {
       const struct glsl_type *array;            /**< Type of array elements. */
-      const struct glsl_type *parameters;       /**< Parameters to function. */
+      struct glsl_function_param *parameters;   /**< Parameters to function. */
       struct glsl_struct_field *structure;      /**< List of struct fields. */
    } fields;
 
@@ -250,6 +251,8 @@ struct glsl_type {
                                                 bool array,
                                                 glsl_base_type type);
 
+   static const glsl_type *get_image_instance(enum glsl_sampler_dim dim,
+                                              bool array, glsl_base_type type);
 
    /**
     * Get the instance of an array type
@@ -278,6 +281,13 @@ struct glsl_type {
    static const glsl_type *get_subroutine_instance(const char *subroutine_name);
 
    /**
+    * Get the instance of a function type
+    */
+   static const glsl_type *get_function_instance(const struct glsl_type *return_type,
+                                                 const glsl_function_param *parameters,
+                                                 unsigned num_params);
+
+   /**
     * Get the type resulting from a multiplication of \p type_a * \p type_b
     */
    static const glsl_type *get_mul_type(const glsl_type *type_a,
@@ -324,8 +334,11 @@ struct glsl_type {
     * varying slots the type will use up in the absence of varying packing
     * (and thus, it can be used to measure the number of varying slots used by
     * the varyings that are generated by lower_packed_varyings).
+    *
+    * For vertex shader attributes - doubles only take one slot.
+    * For inter-shader varyings - dvec3/dvec4 take two slots.
     */
-   unsigned count_attribute_slots() const;
+   unsigned count_attribute_slots(bool vertex_input_slots) const;
 
    /**
     * Alignment in bytes of the start of this type in a std140 uniform
@@ -471,6 +484,14 @@ struct glsl_type {
    }
 
    /**
+    * Query whether a double takes two slots.
+    */
+   bool is_dual_slot_double() const
+   {
+      return base_type == GLSL_TYPE_DOUBLE && vector_elements > 2;
+   }
+
+   /**
     * Query whether or not a type is a non-array boolean type
     */
    bool is_boolean() const
@@ -747,6 +768,10 @@ private:
    glsl_type(const glsl_struct_field *fields, unsigned num_fields,
             enum glsl_interface_packing packing, const char *name);
 
+   /** Constructor for interface types */
+   glsl_type(const glsl_type *return_type,
+             const glsl_function_param *params, unsigned num_params);
+
    /** Constructor for array types */
    glsl_type(const glsl_type *array, unsigned length);
 
@@ -765,6 +790,9 @@ private:
    /** Hash table containing the known subroutine types. */
    static struct hash_table *subroutine_types;
 
+   /** Hash table containing the known function types. */
+   static struct hash_table *function_types;
+
    static bool record_key_compare(const void *a, const void *b);
    static unsigned record_key_hash(const void *key);
 
@@ -792,6 +820,10 @@ private:
    /*@}*/
 };
 
+#undef DECL_TYPE
+#undef STRUCT_TYPE
+#endif /* __cplusplus */
+
 struct glsl_struct_field {
    const struct glsl_type *type;
    const char *name;
@@ -849,6 +881,7 @@ struct glsl_struct_field {
    unsigned image_volatile:1;
    unsigned image_restrict:1;
 
+#ifdef __cplusplus
    glsl_struct_field(const struct glsl_type *_type, const char *_name)
       : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
         sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
@@ -861,6 +894,14 @@ struct glsl_struct_field {
    {
       /* empty */
    }
+#endif
+};
+
+struct glsl_function_param {
+   const struct glsl_type *type;
+
+   bool in;
+   bool out;
 };
 
 static inline unsigned int
@@ -869,8 +910,4 @@ glsl_align(unsigned int a, unsigned int align)
    return (a + align - 1) / align * align;
 }
 
-#undef DECL_TYPE
-#undef STRUCT_TYPE
-#endif /* __cplusplus */
-
 #endif /* GLSL_TYPES_H */
index 35fc1de..42a53f6 100644 (file)
@@ -39,6 +39,7 @@ nir_shader_create(void *mem_ctx,
    exec_list_make_empty(&shader->uniforms);
    exec_list_make_empty(&shader->inputs);
    exec_list_make_empty(&shader->outputs);
+   exec_list_make_empty(&shader->shared);
 
    shader->options = options;
    memset(&shader->info, 0, sizeof(shader->info));
@@ -52,6 +53,7 @@ nir_shader_create(void *mem_ctx,
    shader->num_inputs = 0;
    shader->num_outputs = 0;
    shader->num_uniforms = 0;
+   shader->num_shared = 0;
 
    shader->stage = stage;
 
@@ -132,6 +134,11 @@ nir_shader_add_variable(nir_shader *shader, nir_variable *var)
       exec_list_push_tail(&shader->uniforms, &var->node);
       break;
 
+   case nir_var_shared:
+      assert(shader->stage == MESA_SHADER_COMPUTE);
+      exec_list_push_tail(&shader->shared, &var->node);
+      break;
+
    case nir_var_system_value:
       exec_list_push_tail(&shader->system_values, &var->node);
       break;
@@ -163,7 +170,7 @@ nir_variable *
 nir_local_variable_create(nir_function_impl *impl,
                           const struct glsl_type *type, const char *name)
 {
-   nir_variable *var = rzalloc(impl->overload->function->shader, nir_variable);
+   nir_variable *var = rzalloc(impl->function->shader, nir_variable);
    var->name = ralloc_strdup(var, name);
    var->type = type;
    var->data.mode = nir_var_local;
@@ -179,31 +186,17 @@ nir_function_create(nir_shader *shader, const char *name)
    nir_function *func = ralloc(shader, nir_function);
 
    exec_list_push_tail(&shader->functions, &func->node);
-   exec_list_make_empty(&func->overload_list);
+
    func->name = ralloc_strdup(func, name);
    func->shader = shader;
+   func->num_params = 0;
+   func->params = NULL;
+   func->return_type = glsl_void_type();
+   func->impl = NULL;
 
    return func;
 }
 
-nir_function_overload *
-nir_function_overload_create(nir_function *func)
-{
-   void *mem_ctx = ralloc_parent(func);
-
-   nir_function_overload *overload = ralloc(mem_ctx, nir_function_overload);
-
-   overload->num_params = 0;
-   overload->params = NULL;
-   overload->return_type = glsl_void_type();
-   overload->impl = NULL;
-
-   exec_list_push_tail(&func->overload_list, &overload->node);
-   overload->function = func;
-
-   return overload;
-}
-
 void nir_src_copy(nir_src *dest, const nir_src *src, void *mem_ctx)
 {
    dest->is_ssa = src->is_ssa;
@@ -268,16 +261,11 @@ cf_init(nir_cf_node *node, nir_cf_node_type type)
 }
 
 nir_function_impl *
-nir_function_impl_create(nir_function_overload *overload)
+nir_function_impl_create_bare(nir_shader *shader)
 {
-   assert(overload->impl == NULL);
-
-   void *mem_ctx = ralloc_parent(overload);
-
-   nir_function_impl *impl = ralloc(mem_ctx, nir_function_impl);
+   nir_function_impl *impl = ralloc(shader, nir_function_impl);
 
-   overload->impl = impl;
-   impl->overload = overload;
+   impl->function = NULL;
 
    cf_init(&impl->cf_node, nir_cf_node_function);
 
@@ -292,8 +280,8 @@ nir_function_impl_create(nir_function_overload *overload)
    impl->valid_metadata = nir_metadata_none;
 
    /* create start & end blocks */
-   nir_block *start_block = nir_block_create(mem_ctx);
-   nir_block *end_block = nir_block_create(mem_ctx);
+   nir_block *start_block = nir_block_create(shader);
+   nir_block *end_block = nir_block_create(shader);
    start_block->cf_node.parent = &impl->cf_node;
    end_block->cf_node.parent = &impl->cf_node;
    impl->end_block = end_block;
@@ -305,6 +293,23 @@ nir_function_impl_create(nir_function_overload *overload)
    return impl;
 }
 
+nir_function_impl *
+nir_function_impl_create(nir_function *function)
+{
+   assert(function->impl == NULL);
+
+   nir_function_impl *impl = nir_function_impl_create_bare(function->shader);
+
+   function->impl = impl;
+   impl->function = function;
+
+   impl->num_params = function->num_params;
+   impl->params = ralloc_array(function->shader,
+                               nir_variable *, impl->num_params);
+
+   return impl;
+}
+
 nir_block *
 nir_block_create(nir_shader *shader)
 {
@@ -474,7 +479,7 @@ nir_intrinsic_instr_create(nir_shader *shader, nir_intrinsic_op op)
 }
 
 nir_call_instr *
-nir_call_instr_create(nir_shader *shader, nir_function_overload *callee)
+nir_call_instr_create(nir_shader *shader, nir_function *callee)
 {
    nir_call_instr *instr = ralloc(shader, nir_call_instr);
    instr_init(&instr->instr, nir_instr_type_call);
@@ -500,8 +505,10 @@ nir_tex_instr_create(nir_shader *shader, unsigned num_srcs)
    for (unsigned i = 0; i < num_srcs; i++)
       src_init(&instr->src[i].src);
 
+   instr->texture_index = 0;
+   instr->texture_array_size = 0;
+   instr->texture = NULL;
    instr->sampler_index = 0;
-   instr->sampler_array_size = 0;
    instr->sampler = NULL;
 
    return instr;
@@ -696,6 +703,69 @@ nir_cf_node_get_function(nir_cf_node *node)
    return nir_cf_node_as_function(node);
 }
 
+/* Reduces a cursor by trying to convert everything to after and trying to
+ * go up to block granularity when possible.
+ */
+static nir_cursor
+reduce_cursor(nir_cursor cursor)
+{
+   switch (cursor.option) {
+   case nir_cursor_before_block:
+      if (exec_list_is_empty(&cursor.block->instr_list)) {
+         /* Empty block.  After is as good as before. */
+         cursor.option = nir_cursor_after_block;
+      } else {
+         /* Try to switch to after the previous block if there is one.
+          * (This isn't likely, but it can happen.)
+          */
+         nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node);
+         if (prev_node && prev_node->type == nir_cf_node_block) {
+            cursor.block = nir_cf_node_as_block(prev_node);
+            cursor.option = nir_cursor_after_block;
+         }
+      }
+      return cursor;
+
+   case nir_cursor_after_block:
+      return cursor;
+
+   case nir_cursor_before_instr: {
+      nir_instr *prev_instr = nir_instr_prev(cursor.instr);
+      if (prev_instr) {
+         /* Before this instruction is after the previous */
+         cursor.instr = prev_instr;
+         cursor.option = nir_cursor_after_instr;
+      } else {
+         /* No previous instruction.  Switch to before block */
+         cursor.block = cursor.instr->block;
+         cursor.option = nir_cursor_before_block;
+      }
+      return reduce_cursor(cursor);
+   }
+
+   case nir_cursor_after_instr:
+      if (nir_instr_next(cursor.instr) == NULL) {
+         /* This is the last instruction, switch to after block */
+         cursor.option = nir_cursor_after_block;
+         cursor.block = cursor.instr->block;
+      }
+      return cursor;
+
+   default:
+      unreachable("Inavlid cursor option");
+   }
+}
+
+bool
+nir_cursors_equal(nir_cursor a, nir_cursor b)
+{
+   /* Reduced cursors should be unique */
+   a = reduce_cursor(a);
+   b = reduce_cursor(b);
+
+   return a.block == b.block && a.option == b.option;
+}
+
 static bool
 add_use_cb(nir_src *src, void *state)
 {
@@ -1019,6 +1089,10 @@ visit_tex_src(nir_tex_instr *instr, nir_foreach_src_cb cb, void *state)
       if (!visit_src(&instr->src[i].src, cb, state))
          return false;
 
+   if (instr->texture != NULL)
+      if (!visit_deref_src(instr->texture, cb, state))
+         return false;
+
    if (instr->sampler != NULL)
       if (!visit_deref_src(instr->sampler, cb, state))
          return false;
@@ -1588,6 +1662,10 @@ nir_intrinsic_from_system_value(gl_system_value val)
       return nir_intrinsic_load_vertex_id;
    case SYSTEM_VALUE_INSTANCE_ID:
       return nir_intrinsic_load_instance_id;
+   case SYSTEM_VALUE_DRAW_ID:
+      return nir_intrinsic_load_draw_id;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      return nir_intrinsic_load_base_instance;
    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
       return nir_intrinsic_load_vertex_id_zero_base;
    case SYSTEM_VALUE_BASE_VERTEX:
@@ -1633,6 +1711,10 @@ nir_system_value_from_intrinsic(nir_intrinsic_op intrin)
       return SYSTEM_VALUE_VERTEX_ID;
    case nir_intrinsic_load_instance_id:
       return SYSTEM_VALUE_INSTANCE_ID;
+   case nir_intrinsic_load_draw_id:
+      return SYSTEM_VALUE_DRAW_ID;
+   case nir_intrinsic_load_base_instance:
+      return SYSTEM_VALUE_BASE_INSTANCE;
    case nir_intrinsic_load_vertex_id_zero_base:
       return SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
    case nir_intrinsic_load_base_vertex:
index 2e72e66..4e35331 100644 (file)
@@ -65,7 +65,6 @@ name(const in_type *parent)                              \
    return exec_node_data(out_type, parent, field);       \
 }
 
-struct nir_function_overload;
 struct nir_function;
 struct nir_shader;
 struct nir_instr;
@@ -89,6 +88,7 @@ typedef enum {
    nir_var_local,
    nir_var_uniform,
    nir_var_shader_storage,
+   nir_var_shared,
    nir_var_system_value
 } nir_variable_mode;
 
@@ -216,15 +216,6 @@ typedef struct {
       unsigned has_initializer:1;
 
       /**
-       * Is this variable a generic output or input that has not yet been matched
-       * up to a variable in another stage of the pipeline?
-       *
-       * This is used by the linker as scratch storage while assigning locations
-       * to generic inputs and outputs.
-       */
-      unsigned is_unmatched_generic_inout:1;
-
-      /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
        * accessing components.  For example, an offset of 1 means that the x
@@ -292,6 +283,11 @@ typedef struct {
       int index;
 
       /**
+       * Descriptor set binding for sampler or UBO.
+       */
+      int descriptor_set;
+
+      /**
        * Initial binding point for a sampler or UBO.
        *
        * For array types, this represents the binding point for the first element.
@@ -301,9 +297,7 @@ typedef struct {
       /**
        * Location an atomic counter is stored at.
        */
-      struct {
-         unsigned offset;
-      } atomic;
+      unsigned offset;
 
       /**
        * ARB_shader_image_load_store qualifiers.
@@ -361,6 +355,34 @@ typedef struct {
 #define nir_foreach_variable(var, var_list) \
    foreach_list_typed(nir_variable, var, node, var_list)
 
+/**
+ * Returns the bits in the inputs_read, outputs_written, or
+ * system_values_read bitfield corresponding to this variable.
+ */
+static inline uint64_t
+nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage)
+{
+   assert(var->data.mode == nir_var_shader_in ||
+          var->data.mode == nir_var_shader_out ||
+          var->data.mode == nir_var_system_value);
+   assert(var->data.location >= 0);
+
+   const struct glsl_type *var_type = var->type;
+   if (stage == MESA_SHADER_GEOMETRY && var->data.mode == nir_var_shader_in) {
+      /* Most geometry shader inputs are per-vertex arrays */
+      if (var->data.location >= VARYING_SLOT_VAR0)
+         assert(glsl_type_is_array(var_type));
+
+      if (glsl_type_is_array(var_type))
+         var_type = glsl_get_array_element(var_type);
+   }
+
+   bool is_vertex_input = (var->data.mode == nir_var_shader_in &&
+                           stage == MESA_SHADER_VERTEX);
+   unsigned slots = glsl_count_attribute_slots(var_type, is_vertex_input);
+   return ((1ull << slots) - 1) << var->data.location;
+}
+
 typedef struct {
    struct exec_node node;
 
@@ -515,7 +537,11 @@ typedef struct nir_src {
    bool is_ssa;
 } nir_src;
 
-#define NIR_SRC_INIT (nir_src) { { NULL } }
+#ifdef __cplusplus
+#  define NIR_SRC_INIT nir_src()
+#else
+#  define NIR_SRC_INIT (nir_src) { { NULL } }
+#endif
 
 #define nir_foreach_use(reg_or_ssa_def, src) \
    list_for_each_entry(nir_src, src, &(reg_or_ssa_def)->uses, use_link)
@@ -538,7 +564,11 @@ typedef struct {
    bool is_ssa;
 } nir_dest;
 
-#define NIR_DEST_INIT (nir_dest) { { { NULL } } }
+#ifdef __cplusplus
+#  define NIR_DEST_INIT nir_dest()
+#else
+#  define NIR_DEST_INIT (nir_dest) { { { NULL } } }
+#endif
 
 #define nir_foreach_def(reg, dest) \
    list_for_each_entry(nir_dest, dest, &(reg)->defs, reg.def_link)
@@ -794,7 +824,7 @@ typedef struct {
    nir_deref_var **params;
    nir_deref_var *return_deref;
 
-   struct nir_function_overload *callee;
+   struct nir_function *callee;
 } nir_call_instr;
 
 #define INTRINSIC(name, num_srcs, src_components, has_dest, dest_components, \
@@ -935,6 +965,7 @@ typedef enum {
    nir_tex_src_ms_index, /* MSAA sample index */
    nir_tex_src_ddx,
    nir_tex_src_ddy,
+   nir_tex_src_texture_offset, /* < dynamically uniform indirect offset */
    nir_tex_src_sampler_offset, /* < dynamically uniform indirect offset */
    nir_num_tex_src_types
 } nir_tex_src_type;
@@ -985,6 +1016,24 @@ typedef struct {
    /* gather component selector */
    unsigned component : 2;
 
+   /** The texture index
+    *
+    * If this texture instruction has a nir_tex_src_texture_offset source,
+    * then the texture index is given by texture_index + texture_offset.
+    */
+   unsigned texture_index;
+
+   /** The size of the texture array or 0 if it's not an array */
+   unsigned texture_array_size;
+
+   /** The texture deref
+    *
+    * If both this and `sampler` are both NULL, use texture_index instead.
+    * If `texture` is NULL, but `sampler` is non-NULL, then the texture is
+    * implied from the sampler.
+    */
+   nir_deref_var *texture;
+
    /** The sampler index
     *
     * If this texture instruction has a nir_tex_src_sampler_offset source,
@@ -992,10 +1041,11 @@ typedef struct {
     */
    unsigned sampler_index;
 
-   /** The size of the sampler array or 0 if it's not an array */
-   unsigned sampler_array_size;
-
-   nir_deref_var *sampler; /* if this is NULL, use sampler_index instead */
+   /** The sampler deref
+    *
+    * If this is null, use sampler_index instead.
+    */
+   nir_deref_var *sampler;
 } nir_tex_instr;
 
 static inline unsigned
@@ -1348,8 +1398,8 @@ typedef enum {
 typedef struct {
    nir_cf_node cf_node;
 
-   /** pointer to the overload of which this is an implementation */
-   struct nir_function_overload *overload;
+   /** pointer to the function of which this is an implementation */
+   struct nir_function *function;
 
    struct exec_list body; /** < list of nir_cf_node */
 
@@ -1434,37 +1484,35 @@ typedef struct {
    const struct glsl_type *type;
 } nir_parameter;
 
-typedef struct nir_function_overload {
+typedef struct nir_function {
    struct exec_node node;
 
+   const char *name;
+   struct nir_shader *shader;
+
    unsigned num_params;
    nir_parameter *params;
    const struct glsl_type *return_type;
 
-   nir_function_impl *impl; /** < NULL if the overload is only declared yet */
-
-   /** pointer to the function of which this is an overload */
-   struct nir_function *function;
-} nir_function_overload;
-
-typedef struct nir_function {
-   struct exec_node node;
-
-   struct exec_list overload_list; /** < list of nir_function_overload */
-   const char *name;
-   struct nir_shader *shader;
+   /** The implementation of this function.
+    *
+    * If the function is only declared and not implemented, this is NULL.
+    */
+   nir_function_impl *impl;
 } nir_function;
 
-#define nir_function_first_overload(func) \
-   exec_node_data(nir_function_overload, \
-                  exec_list_get_head(&(func)->overload_list), node)
-
 typedef struct nir_shader_compiler_options {
+   bool lower_fdiv;
    bool lower_ffma;
    bool lower_flrp;
    bool lower_fpow;
    bool lower_fsat;
    bool lower_fsqrt;
+   bool lower_fmod;
+   bool lower_bitfield_extract;
+   bool lower_bitfield_insert;
+   bool lower_uadd_carry;
+   bool lower_usub_borrow;
    /** lowers fneg and ineg to fsub and isub. */
    bool lower_negate;
    /** lowers fsub and isub to fadd+fneg and iadd+ineg. */
@@ -1487,6 +1535,9 @@ typedef struct nir_shader_compiler_options {
     * are simulated by floats.)
     */
    bool native_integers;
+
+   /* Indicates that the driver only has zero-based vertex id */
+   bool vertex_id_zero_based;
 } nir_shader_compiler_options;
 
 typedef struct nir_shader_info {
@@ -1585,6 +1636,9 @@ typedef struct nir_shader {
    /** list of outputs (nir_variable) */
    struct exec_list outputs;
 
+   /** list of shared compute variables (nir_variable) */
+   struct exec_list shared;
+
    /** Set of driver-specific options for the shader.
     *
     * The memory for the options is expected to be kept in a single static
@@ -1613,16 +1667,14 @@ typedef struct nir_shader {
     * the highest index a load_input_*, load_uniform_*, etc. intrinsic can
     * access plus one
     */
-   unsigned num_inputs, num_uniforms, num_outputs;
+   unsigned num_inputs, num_uniforms, num_outputs, num_shared;
 
    /** The shader stage, such as MESA_SHADER_VERTEX. */
    gl_shader_stage stage;
 } nir_shader;
 
-#define nir_foreach_overload(shader, overload)                        \
-   foreach_list_typed(nir_function, func, node, &(shader)->functions) \
-      foreach_list_typed(nir_function_overload, overload, node, \
-                         &(func)->overload_list)
+#define nir_foreach_function(shader, func) \
+   foreach_list_typed(nir_function, func, node, &(shader)->functions)
 
 nir_shader *nir_shader_create(void *mem_ctx,
                               gl_shader_stage stage,
@@ -1658,10 +1710,9 @@ nir_variable *nir_local_variable_create(nir_function_impl *impl,
 /** creates a function and adds it to the shader's list of functions */
 nir_function *nir_function_create(nir_shader *shader, const char *name);
 
-/** creates a null function returning null */
-nir_function_overload *nir_function_overload_create(nir_function *func);
-
-nir_function_impl *nir_function_impl_create(nir_function_overload *func);
+nir_function_impl *nir_function_impl_create(nir_function *func);
+/** creates a function_impl that isn't tied to any particular function */
+nir_function_impl *nir_function_impl_create_bare(nir_shader *shader);
 
 nir_block *nir_block_create(nir_shader *shader);
 nir_if *nir_if_create(nir_shader *shader);
@@ -1686,7 +1737,7 @@ nir_intrinsic_instr *nir_intrinsic_instr_create(nir_shader *shader,
                                                 nir_intrinsic_op op);
 
 nir_call_instr *nir_call_instr_create(nir_shader *shader,
-                                      nir_function_overload *callee);
+                                      nir_function *callee);
 
 nir_tex_instr *nir_tex_instr_create(nir_shader *shader, unsigned num_srcs);
 
@@ -1731,6 +1782,19 @@ typedef struct {
    };
 } nir_cursor;
 
+static inline nir_block *
+nir_cursor_current_block(nir_cursor cursor)
+{
+   if (cursor.option == nir_cursor_before_instr ||
+       cursor.option == nir_cursor_after_instr) {
+      return cursor.instr->block;
+   } else {
+      return cursor.block;
+   }
+}
+
+bool nir_cursors_equal(nir_cursor a, nir_cursor b);
+
 static inline nir_cursor
 nir_before_block(nir_block *block)
 {
@@ -1797,6 +1861,22 @@ nir_after_cf_node(nir_cf_node *node)
 }
 
 static inline nir_cursor
+nir_after_cf_node_and_phis(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_after_block(nir_cf_node_as_block(node));
+
+   nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
+   assert(block->cf_node.type == nir_cf_node_block);
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         return nir_before_instr(instr);
+   }
+   return nir_after_block(block);
+}
+
+static inline nir_cursor
 nir_before_cf_list(struct exec_list *cf_list)
 {
    nir_cf_node *first_node = exec_node_data(nir_cf_node,
@@ -1922,18 +2002,54 @@ void nir_index_blocks(nir_function_impl *impl);
 void nir_print_shader(nir_shader *shader, FILE *fp);
 void nir_print_instr(const nir_instr *instr, FILE *fp);
 
-nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
+nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
+nir_function_impl *nir_function_impl_clone(const nir_function_impl *impl);
+nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
 
 #ifdef DEBUG
 void nir_validate_shader(nir_shader *shader);
 void nir_metadata_set_validation_flag(nir_shader *shader);
 void nir_metadata_check_validation_flag(nir_shader *shader);
+
+#include "util/debug.h"
+static inline bool
+should_clone_nir(void)
+{
+   static int should_clone = -1;
+   if (should_clone < 0)
+      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false);
+
+   return should_clone;
+}
 #else
 static inline void nir_validate_shader(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
+static inline bool should_clone_nir(void) { return false; }
 #endif /* DEBUG */
 
+#define _PASS(nir, do_pass) do {                                     \
+   do_pass                                                           \
+   nir_validate_shader(nir);                                         \
+   if (should_clone_nir()) {                                         \
+      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
+      ralloc_free(nir);                                              \
+      nir = clone;                                                   \
+   }                                                                 \
+} while (0)
+
+#define NIR_PASS(progress, nir, pass, ...) _PASS(nir,                \
+   nir_metadata_set_validation_flag(nir);                            \
+   if (pass(nir, ##__VA_ARGS__)) {                                   \
+      progress = true;                                               \
+      nir_metadata_check_validation_flag(nir);                       \
+   }                                                                 \
+)
+
+#define NIR_PASS_V(nir, pass, ...) _PASS(nir,                        \
+   pass(nir, ##__VA_ARGS__);                                         \
+)
+
 void nir_calc_dominance_impl(nir_function_impl *impl);
 void nir_calc_dominance(nir_shader *shader);
 
@@ -1953,14 +2069,24 @@ int nir_gs_count_vertices(const nir_shader *shader);
 
 bool nir_split_var_copies(nir_shader *shader);
 
+bool nir_lower_returns_impl(nir_function_impl *impl);
+bool nir_lower_returns(nir_shader *shader);
+
+bool nir_inline_functions(nir_shader *shader);
+
 void nir_lower_var_copy_instr(nir_intrinsic_instr *copy, void *mem_ctx);
 void nir_lower_var_copies(nir_shader *shader);
 
 bool nir_lower_global_vars_to_local(nir_shader *shader);
 
+bool nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask);
+
 bool nir_lower_locals_to_regs(nir_shader *shader);
 
-void nir_lower_outputs_to_temporaries(nir_shader *shader);
+void nir_lower_outputs_to_temporaries(nir_shader *shader,
+                                      nir_function *entrypoint);
+
+void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint);
 
 void nir_assign_var_locations(struct exec_list *var_list,
                               unsigned *size,
@@ -1974,7 +2100,7 @@ nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr);
 
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
-bool nir_remove_dead_variables(nir_shader *shader);
+bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode);
 
 void nir_move_vec_src_uses_to_dest(nir_shader *shader);
 bool nir_lower_vec_to_movs(nir_shader *shader);
@@ -2058,6 +2184,9 @@ bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
 void nir_convert_to_ssa(nir_shader *shader);
 
+bool nir_repair_ssa_impl(nir_function_impl *impl);
+bool nir_repair_ssa(nir_shader *shader);
+
 /* If phi_webs_only is true, only convert SSA values involved in phi nodes to
  * registers.  If false, convert all values (even those not involved in a phi
  * node) to registers.
index bbf4f08..14c0e82 100644 (file)
@@ -108,7 +108,7 @@ class Constant(Value):
       if isinstance(self.value, (bool)):
          return 'NIR_TRUE' if self.value else 'NIR_FALSE'
       if isinstance(self.value, (int, long)):
-         return hex(struct.unpack('I', struct.pack('i', self.value))[0])
+         return hex(struct.unpack('I', struct.pack('i' if self.value < 0 else 'I', self.value))[0])
       elif isinstance(self.value, float):
          return hex(struct.unpack('I', struct.pack('f', self.value))[0])
       else:
@@ -276,9 +276,9 @@ ${pass_name}(nir_shader *shader)
    condition_flags[${index}] = ${condition};
    % endfor
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress |= ${pass_name}_impl(overload->impl, condition_flags);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= ${pass_name}_impl(function->impl, condition_flags);
    }
 
    return progress;
index b909f48..e842b22 100644 (file)
@@ -40,7 +40,18 @@ nir_builder_init(nir_builder *build, nir_function_impl *impl)
 {
    memset(build, 0, sizeof(*build));
    build->impl = impl;
-   build->shader = impl->overload->function->shader;
+   build->shader = impl->function->shader;
+}
+
+static inline void
+nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx,
+                               gl_shader_stage stage,
+                               const nir_shader_compiler_options *options)
+{
+   build->shader = nir_shader_create(mem_ctx, stage, options);
+   nir_function *func = nir_function_create(build->shader, "main");
+   build->impl = nir_function_impl_create(func);
+   build->cursor = nir_after_cf_list(&build->impl->body);
 }
 
 static inline void
@@ -59,6 +70,20 @@ nir_builder_cf_insert(nir_builder *build, nir_cf_node *cf)
 }
 
 static inline nir_ssa_def *
+nir_ssa_undef(nir_builder *build, unsigned num_components)
+{
+   nir_ssa_undef_instr *undef =
+      nir_ssa_undef_instr_create(build->shader, num_components);
+   if (!undef)
+      return NULL;
+
+   nir_instr_insert(nir_before_block(nir_start_block(build->impl)),
+                    &undef->instr);
+
+   return &undef->def;
+}
+
+static inline nir_ssa_def *
 nir_build_imm(nir_builder *build, unsigned num_components, nir_const_value value)
 {
    nir_load_const_instr *load_const =
@@ -249,6 +274,23 @@ nir_swizzle(nir_builder *build, nir_ssa_def *src, unsigned swiz[4],
                      nir_imov_alu(build, alu_src, num_components);
 }
 
+/* Selects the right fdot given the number of components in each source. */
+static inline nir_ssa_def *
+nir_fdot(nir_builder *build, nir_ssa_def *src0, nir_ssa_def *src1)
+{
+   assert(src0->num_components == src1->num_components);
+   switch (src0->num_components) {
+   case 1: return nir_fmul(build, src0, src1);
+   case 2: return nir_fdot2(build, src0, src1);
+   case 3: return nir_fdot3(build, src0, src1);
+   case 4: return nir_fdot4(build, src0, src1);
+   default:
+      unreachable("bad component size");
+   }
+
+   return NULL;
+}
+
 static inline nir_ssa_def *
 nir_channel(nir_builder *b, nir_ssa_def *def, unsigned c)
 {
@@ -310,16 +352,76 @@ nir_load_var(nir_builder *build, nir_variable *var)
 }
 
 static inline void
-nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value)
+nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
+              unsigned writemask)
 {
    const unsigned num_components = glsl_get_vector_elements(var->type);
 
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
    store->num_components = num_components;
+   store->const_index[0] = writemask;
    store->variables[0] = nir_deref_var_create(store, var);
    store->src[0] = nir_src_for_ssa(value);
    nir_builder_instr_insert(build, &store->instr);
 }
 
+static inline void
+nir_store_deref_var(nir_builder *build, nir_deref_var *deref,
+                    nir_ssa_def *value, unsigned writemask)
+{
+   const unsigned num_components =
+      glsl_get_vector_elements(nir_deref_tail(&deref->deref)->type);
+
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
+   store->num_components = num_components;
+   store->const_index[0] = writemask & ((1 << num_components) - 1);
+   store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &deref->deref));
+   store->src[0] = nir_src_for_ssa(value);
+   nir_builder_instr_insert(build, &store->instr);
+}
+
+static inline void
+nir_copy_deref_var(nir_builder *build, nir_deref_var *dest, nir_deref_var *src)
+{
+   assert(nir_deref_tail(&dest->deref)->type ==
+          nir_deref_tail(&src->deref)->type);
+
+   nir_intrinsic_instr *copy =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
+   copy->variables[0] = nir_deref_as_var(nir_copy_deref(copy, &dest->deref));
+   copy->variables[1] = nir_deref_as_var(nir_copy_deref(copy, &src->deref));
+   nir_builder_instr_insert(build, &copy->instr);
+}
+
+static inline void
+nir_copy_var(nir_builder *build, nir_variable *dest, nir_variable *src)
+{
+   nir_intrinsic_instr *copy =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
+   copy->variables[0] = nir_deref_var_create(copy, dest);
+   copy->variables[1] = nir_deref_var_create(copy, src);
+   nir_builder_instr_insert(build, &copy->instr);
+}
+
+static inline nir_ssa_def *
+nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
+{
+   nir_intrinsic_instr *load = nir_intrinsic_instr_create(build->shader, op);
+   load->num_components = nir_intrinsic_infos[op].dest_components;
+   load->const_index[0] = index;
+   nir_ssa_dest_init(&load->instr, &load->dest,
+                     nir_intrinsic_infos[op].dest_components, NULL);
+   nir_builder_instr_insert(build, &load->instr);
+   return &load->dest.ssa;
+}
+
+static inline void
+nir_jump(nir_builder *build, nir_jump_type jump_type)
+{
+   nir_jump_instr *jump = nir_jump_instr_create(build->shader, jump_type);
+   nir_builder_instr_insert(build, &jump->instr);
+}
+
 #endif /* NIR_BUILDER_H */
index 33ff526..bc6df56 100644 (file)
  */
 
 typedef struct {
+   /* True if we are cloning an entire shader. */
+   bool global_clone;
+
    /* maps orig ptr -> cloned ptr: */
-   struct hash_table *ptr_table;
+   struct hash_table *remap_table;
 
    /* List of phi sources. */
    struct list_head phi_srcs;
@@ -43,28 +46,32 @@ typedef struct {
 } clone_state;
 
 static void
-init_clone_state(clone_state *state)
+init_clone_state(clone_state *state, bool global)
 {
-   state->ptr_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                              _mesa_key_pointer_equal);
+   state->global_clone = global;
+   state->remap_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                                _mesa_key_pointer_equal);
    list_inithead(&state->phi_srcs);
 }
 
 static void
 free_clone_state(clone_state *state)
 {
-   _mesa_hash_table_destroy(state->ptr_table, NULL);
+   _mesa_hash_table_destroy(state->remap_table, NULL);
 }
 
-static void *
-lookup_ptr(clone_state *state, const void *ptr)
+static inline void *
+_lookup_ptr(clone_state *state, const void *ptr, bool global)
 {
    struct hash_entry *entry;
 
    if (!ptr)
       return NULL;
 
-   entry = _mesa_hash_table_search(state->ptr_table, ptr);
+   if (!state->global_clone && global)
+      return (void *)ptr;
+
+   entry = _mesa_hash_table_search(state->remap_table, ptr);
    assert(entry && "Failed to find pointer!");
    if (!entry)
       return NULL;
@@ -73,13 +80,37 @@ lookup_ptr(clone_state *state, const void *ptr)
 }
 
 static void
-store_ptr(clone_state *state, void *nptr, const void *ptr)
+add_remap(clone_state *state, void *nptr, const void *ptr)
+{
+   _mesa_hash_table_insert(state->remap_table, ptr, nptr);
+}
+
+static void *
+remap_local(clone_state *state, const void *ptr)
+{
+   return _lookup_ptr(state, ptr, false);
+}
+
+static void *
+remap_global(clone_state *state, const void *ptr)
+{
+   return _lookup_ptr(state, ptr, true);
+}
+
+static nir_register *
+remap_reg(clone_state *state, const nir_register *reg)
+{
+   return _lookup_ptr(state, reg, reg->is_global);
+}
+
+static nir_variable *
+remap_var(clone_state *state, const nir_variable *var)
 {
-   _mesa_hash_table_insert(state->ptr_table, ptr, nptr);
+   return _lookup_ptr(state, var, var->data.mode != nir_var_local);
 }
 
-static nir_constant *
-clone_constant(clone_state *state, const nir_constant *c, nir_variable *nvar)
+nir_constant *
+nir_constant_clone(const nir_constant *c, nir_variable *nvar)
 {
    nir_constant *nc = ralloc(nvar, nir_constant);
 
@@ -87,7 +118,7 @@ clone_constant(clone_state *state, const nir_constant *c, nir_variable *nvar)
    nc->num_elements = c->num_elements;
    nc->elements = ralloc_array(nvar, nir_constant *, c->num_elements);
    for (unsigned i = 0; i < c->num_elements; i++) {
-      nc->elements[i] = clone_constant(state, c->elements[i], nvar);
+      nc->elements[i] = nir_constant_clone(c->elements[i], nvar);
    }
 
    return nc;
@@ -100,7 +131,7 @@ static nir_variable *
 clone_variable(clone_state *state, const nir_variable *var)
 {
    nir_variable *nvar = rzalloc(state->ns, nir_variable);
-   store_ptr(state, nvar, var);
+   add_remap(state, nvar, var);
 
    nvar->type = var->type;
    nvar->name = ralloc_strdup(nvar, var->name);
@@ -111,7 +142,7 @@ clone_variable(clone_state *state, const nir_variable *var)
           var->num_state_slots * sizeof(nir_state_slot));
    if (var->constant_initializer) {
       nvar->constant_initializer =
-         clone_constant(state, var->constant_initializer, nvar);
+         nir_constant_clone(var->constant_initializer, nvar);
    }
    nvar->interface_type = var->interface_type;
 
@@ -137,7 +168,7 @@ static nir_register *
 clone_register(clone_state *state, const nir_register *reg)
 {
    nir_register *nreg = rzalloc(state->ns, nir_register);
-   store_ptr(state, nreg, reg);
+   add_remap(state, nreg, reg);
 
    nreg->num_components = reg->num_components;
    nreg->num_array_elems = reg->num_array_elems;
@@ -172,9 +203,9 @@ __clone_src(clone_state *state, void *ninstr_or_if,
 {
    nsrc->is_ssa = src->is_ssa;
    if (src->is_ssa) {
-      nsrc->ssa = lookup_ptr(state, src->ssa);
+      nsrc->ssa = remap_local(state, src->ssa);
    } else {
-      nsrc->reg.reg = lookup_ptr(state, src->reg.reg);
+      nsrc->reg.reg = remap_reg(state, src->reg.reg);
       if (src->reg.indirect) {
          nsrc->reg.indirect = ralloc(ninstr_or_if, nir_src);
          __clone_src(state, ninstr_or_if, nsrc->reg.indirect, src->reg.indirect);
@@ -190,9 +221,9 @@ __clone_dst(clone_state *state, nir_instr *ninstr,
    ndst->is_ssa = dst->is_ssa;
    if (dst->is_ssa) {
       nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
-      store_ptr(state, &ndst->ssa, &dst->ssa);
+      add_remap(state, &ndst->ssa, &dst->ssa);
    } else {
-      ndst->reg.reg = lookup_ptr(state, dst->reg.reg);
+      ndst->reg.reg = remap_reg(state, dst->reg.reg);
       if (dst->reg.indirect) {
          ndst->reg.indirect = ralloc(ninstr, nir_src);
          __clone_src(state, ninstr, ndst->reg.indirect, dst->reg.indirect);
@@ -208,7 +239,7 @@ static nir_deref_var *
 clone_deref_var(clone_state *state, const nir_deref_var *dvar,
                 nir_instr *ninstr)
 {
-   nir_variable *nvar = lookup_ptr(state, dvar->var);
+   nir_variable *nvar = remap_var(state, dvar->var);
    nir_deref_var *ndvar = nir_deref_var_create(ninstr, nvar);
 
    if (dvar->deref.child)
@@ -322,7 +353,7 @@ clone_load_const(clone_state *state, const nir_load_const_instr *lc)
 
    memcpy(&nlc->value, &lc->value, sizeof(nlc->value));
 
-   store_ptr(state, &nlc->def, &lc->def);
+   add_remap(state, &nlc->def, &lc->def);
 
    return nlc;
 }
@@ -333,7 +364,7 @@ clone_ssa_undef(clone_state *state, const nir_ssa_undef_instr *sa)
    nir_ssa_undef_instr *nsa =
       nir_ssa_undef_instr_create(state->ns, sa->def.num_components);
 
-   store_ptr(state, &nsa->def, &sa->def);
+   add_remap(state, &nsa->def, &sa->def);
 
    return nsa;
 }
@@ -357,8 +388,11 @@ clone_tex(clone_state *state, const nir_tex_instr *tex)
    ntex->is_new_style_shadow = tex->is_new_style_shadow;
    memcpy(ntex->const_offset, tex->const_offset, sizeof(ntex->const_offset));
    ntex->component = tex->component;
+   ntex->texture_index = tex->texture_index;
+   ntex->texture_array_size = tex->texture_array_size;
+   if (tex->texture)
+      ntex->texture = clone_deref_var(state, tex->texture, &ntex->instr);
    ntex->sampler_index = tex->sampler_index;
-   ntex->sampler_array_size = tex->sampler_array_size;
    if (tex->sampler)
       ntex->sampler = clone_deref_var(state, tex->sampler, &ntex->instr);
 
@@ -420,7 +454,7 @@ clone_jump(clone_state *state, const nir_jump_instr *jmp)
 static nir_call_instr *
 clone_call(clone_state *state, const nir_call_instr *call)
 {
-   nir_function_overload *ncallee = lookup_ptr(state, call->callee);
+   nir_function *ncallee = remap_global(state, call->callee);
    nir_call_instr *ncall = nir_call_instr_create(state->ns, ncallee);
 
    for (unsigned i = 0; i < ncall->num_params; i++)
@@ -473,7 +507,7 @@ clone_block(clone_state *state, struct exec_list *cf_list, const nir_block *blk)
    assert(exec_list_is_empty(&nblk->instr_list));
 
    /* We need this for phi sources */
-   store_ptr(state, nblk, blk);
+   add_remap(state, nblk, blk);
 
    nir_foreach_instr(blk, instr) {
       if (instr->type == nir_instr_type_phi) {
@@ -546,10 +580,9 @@ clone_cf_list(clone_state *state, struct exec_list *dst,
 }
 
 static nir_function_impl *
-clone_function_impl(clone_state *state, const nir_function_impl *fi,
-                    nir_function_overload *nfo)
+clone_function_impl(clone_state *state, const nir_function_impl *fi)
 {
-   nir_function_impl *nfi = nir_function_impl_create(nfo);
+   nir_function_impl *nfi = nir_function_impl_create_bare(state->ns);
 
    clone_var_list(state, &nfi->locals, &fi->locals);
    clone_reg_list(state, &nfi->registers, &fi->registers);
@@ -558,9 +591,9 @@ clone_function_impl(clone_state *state, const nir_function_impl *fi,
    nfi->num_params = fi->num_params;
    nfi->params = ralloc_array(state->ns, nir_variable *, fi->num_params);
    for (unsigned i = 0; i < fi->num_params; i++) {
-      nfi->params[i] = lookup_ptr(state, fi->params[i]);
+      nfi->params[i] = remap_local(state, fi->params[i]);
    }
-   nfi->return_var = lookup_ptr(state, fi->return_var);
+   nfi->return_var = remap_local(state, fi->return_var);
 
    assert(list_empty(&state->phi_srcs));
 
@@ -572,9 +605,9 @@ clone_function_impl(clone_state *state, const nir_function_impl *fi,
     * add it to the phi_srcs list and we fix it up here.
     */
    list_for_each_entry_safe(nir_phi_src, src, &state->phi_srcs, src.use_link) {
-      src->pred = lookup_ptr(state, src->pred);
+      src->pred = remap_local(state, src->pred);
       assert(src->src.is_ssa);
-      src->src.ssa = lookup_ptr(state, src->src.ssa);
+      src->src.ssa = remap_local(state, src->src.ssa);
 
       /* Remove from this list and place in the uses of the SSA def */
       list_del(&src->src.use_link);
@@ -588,28 +621,20 @@ clone_function_impl(clone_state *state, const nir_function_impl *fi,
    return nfi;
 }
 
-static nir_function_overload *
-clone_function_overload(clone_state *state, const nir_function_overload *fo,
-                        nir_function *nfxn)
+nir_function_impl *
+nir_function_impl_clone(const nir_function_impl *fi)
 {
-   nir_function_overload *nfo = nir_function_overload_create(nfxn);
-
-   /* Needed for call instructions */
-   store_ptr(state, nfo, fo);
+   clone_state state;
+   init_clone_state(&state, false);
 
-   nfo->num_params = fo->num_params;
-   nfo->params = ralloc_array(state->ns, nir_parameter, fo->num_params);
-   memcpy(nfo->params, fo->params, sizeof(nir_parameter) * fo->num_params);
+   /* We use the same shader */
+   state.ns = fi->function->shader;
 
-   nfo->return_type = fo->return_type;
+   nir_function_impl *nfi = clone_function_impl(&state, fi);
 
-   /* At first glance, it looks like we should clone the function_impl here.
-    * However, call instructions need to be able to reference at least the
-    * overload and those will get processed as we clone the function_impl's.
-    * We stop here and do function_impls as a second pass.
-    */
+   free_clone_state(&state);
 
-   return nfo;
+   return nfi;
 }
 
 static nir_function *
@@ -618,8 +643,20 @@ clone_function(clone_state *state, const nir_function *fxn, nir_shader *ns)
    assert(ns == state->ns);
    nir_function *nfxn = nir_function_create(ns, fxn->name);
 
-   foreach_list_typed(nir_function_overload, fo, node, &fxn->overload_list)
-      clone_function_overload(state, fo, nfxn);
+   /* Needed for call instructions */
+   add_remap(state, nfxn, fxn);
+
+   nfxn->num_params = fxn->num_params;
+   nfxn->params = ralloc_array(state->ns, nir_parameter, fxn->num_params);
+   memcpy(nfxn->params, fxn->params, sizeof(nir_parameter) * fxn->num_params);
+
+   nfxn->return_type = fxn->return_type;
+
+   /* At first glance, it looks like we should clone the function_impl here.
+    * However, call instructions need to be able to reference at least the
+    * function and those will get processed as we clone the function_impl's.
+    * We stop here and do function_impls as a second pass.
+    */
 
    return nfxn;
 }
@@ -628,7 +665,7 @@ nir_shader *
 nir_shader_clone(void *mem_ctx, const nir_shader *s)
 {
    clone_state state;
-   init_clone_state(&state);
+   init_clone_state(&state, true);
 
    nir_shader *ns = nir_shader_create(mem_ctx, s->stage, s->options);
    state.ns = ns;
@@ -636,21 +673,23 @@ nir_shader_clone(void *mem_ctx, const nir_shader *s)
    clone_var_list(&state, &ns->uniforms, &s->uniforms);
    clone_var_list(&state, &ns->inputs,   &s->inputs);
    clone_var_list(&state, &ns->outputs,  &s->outputs);
+   clone_var_list(&state, &ns->shared,   &s->shared);
    clone_var_list(&state, &ns->globals,  &s->globals);
    clone_var_list(&state, &ns->system_values, &s->system_values);
 
-   /* Go through and clone functions and overloads */
+   /* Go through and clone functions */
    foreach_list_typed(nir_function, fxn, node, &s->functions)
       clone_function(&state, fxn, ns);
 
-   /* Only after all overloads are cloned can we clone the actual function
+   /* Only after all functions are cloned can we clone the actual function
     * implementations.  This is because nir_call_instr's need to reference the
-    * overloads of other functions and we don't know what order the functions
+    * functions of other functions and we don't know what order the functions
     * will have in the list.
     */
-   nir_foreach_overload(s, fo) {
-      nir_function_overload *nfo = lookup_ptr(&state, fo);
-      clone_function_impl(&state, fo->impl, nfo);
+   nir_foreach_function(s, fxn) {
+      nir_function *nfxn = remap_global(&state, fxn);
+      nfxn->impl = clone_function_impl(&state, fxn->impl);
+      nfxn->impl->function = nfxn;
    }
 
    clone_reg_list(&state, &ns->registers, &s->registers);
@@ -664,6 +703,7 @@ nir_shader_clone(void *mem_ctx, const nir_shader *s)
    ns->num_inputs = s->num_inputs;
    ns->num_uniforms = s->num_uniforms;
    ns->num_outputs = s->num_outputs;
+   ns->num_shared = s->num_shared;
 
    free_clone_state(&state);
 
index 96395a4..33b06d0 100644 (file)
@@ -336,8 +336,7 @@ block_add_normal_succs(nir_block *block)
          nir_block *next_block = nir_cf_node_as_block(next);
 
          link_blocks(block, next_block, NULL);
-      } else {
-         assert(parent->type == nir_cf_node_loop);
+      } else if (parent->type == nir_cf_node_loop) {
          nir_loop *loop = nir_cf_node_as_loop(parent);
 
          nir_cf_node *head = nir_loop_first_cf_node(loop);
@@ -346,6 +345,10 @@ block_add_normal_succs(nir_block *block)
 
          link_blocks(block, head_block, NULL);
          insert_phi_undef(head_block, block);
+      } else {
+         assert(parent->type == nir_cf_node_function);
+         nir_function_impl *impl = nir_cf_node_as_function(parent);
+         link_blocks(block, impl->end_block, NULL);
       }
    } else {
       nir_cf_node *next = nir_cf_node_next(&block->cf_node);
@@ -746,6 +749,12 @@ nir_cf_extract(nir_cf_list *extracted, nir_cursor begin, nir_cursor end)
 {
    nir_block *block_begin, *block_end, *block_before, *block_after;
 
+   if (nir_cursors_equal(begin, end)) {
+      exec_list_make_empty(&extracted->list);
+      extracted->impl = NULL; /* we shouldn't need this */
+      return;
+   }
+
    /* In the case where begin points to an instruction in some basic block and
     * end points to the end of the same basic block, we rely on the fact that
     * splitting on an instruction moves earlier instructions into a new basic
@@ -785,6 +794,9 @@ nir_cf_reinsert(nir_cf_list *cf_list, nir_cursor cursor)
 {
    nir_block *before, *after;
 
+   if (exec_list_is_empty(&cf_list->list))
+      return;
+
    split_block_cursor(cursor, &before, &after);
 
    foreach_list_typed_safe(nir_cf_node, node, node, &cf_list->list) {
index af4caae..d95f396 100644 (file)
@@ -94,7 +94,6 @@ calc_dominance_cb(nir_block *block, void *_state)
       }
    }
 
-   assert(new_idom);
    if (block->imm_dom != new_idom) {
       block->imm_dom = new_idom;
       state->progress = true;
@@ -112,6 +111,11 @@ calc_dom_frontier_cb(nir_block *block, void *state)
       struct set_entry *entry;
       set_foreach(block->predecessors, entry) {
          nir_block *runner = (nir_block *) entry->key;
+
+         /* Skip unreachable predecessors */
+         if (runner->imm_dom == NULL)
+            continue;
+
          while (runner != block->imm_dom) {
             _mesa_set_add(runner->dom_frontier, block);
             runner = runner->imm_dom;
@@ -221,9 +225,9 @@ nir_calc_dominance_impl(nir_function_impl *impl)
 void
 nir_calc_dominance(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_calc_dominance_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_calc_dominance_impl(function->impl);
    }
 }
 
@@ -277,7 +281,7 @@ dump_block_dom(nir_block *block, void *state)
 void
 nir_dump_dom_tree_impl(nir_function_impl *impl, FILE *fp)
 {
-   fprintf(fp, "digraph doms_%s {\n", impl->overload->function->name);
+   fprintf(fp, "digraph doms_%s {\n", impl->function->name);
    nir_foreach_block(impl, dump_block_dom, fp);
    fprintf(fp, "}\n\n");
 }
@@ -285,9 +289,9 @@ nir_dump_dom_tree_impl(nir_function_impl *impl, FILE *fp)
 void
 nir_dump_dom_tree(nir_shader *shader, FILE *fp)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_dump_dom_tree_impl(overload->impl, fp);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_dump_dom_tree_impl(function->impl, fp);
    }
 }
 
@@ -315,9 +319,9 @@ nir_dump_dom_frontier_impl(nir_function_impl *impl, FILE *fp)
 void
 nir_dump_dom_frontier(nir_shader *shader, FILE *fp)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_dump_dom_frontier_impl(overload->impl, fp);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_dump_dom_frontier_impl(function->impl, fp);
    }
 }
 
@@ -335,7 +339,7 @@ dump_block_succs(nir_block *block, void *state)
 void
 nir_dump_cfg_impl(nir_function_impl *impl, FILE *fp)
 {
-   fprintf(fp, "digraph cfg_%s {\n", impl->overload->function->name);
+   fprintf(fp, "digraph cfg_%s {\n", impl->function->name);
    nir_foreach_block(impl, dump_block_succs, fp);
    fprintf(fp, "}\n\n");
 }
@@ -343,8 +347,8 @@ nir_dump_cfg_impl(nir_function_impl *impl, FILE *fp)
 void
 nir_dump_cfg(nir_shader *shader, FILE *fp)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_dump_cfg_impl(overload->impl, fp);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_dump_cfg_impl(function->impl, fp);
    }
 }
index f2797f7..8bc9f24 100644 (file)
@@ -798,8 +798,8 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
 void
 nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_convert_from_ssa_impl(overload->impl, phi_webs_only);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_convert_from_ssa_impl(function->impl, phi_webs_only);
    }
 }
diff --git a/src/glsl/nir/nir_gather_info.c b/src/glsl/nir/nir_gather_info.c
new file mode 100644 (file)
index 0000000..b84915c
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+
+static void
+gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_discard:
+      assert(shader->stage == MESA_SHADER_FRAGMENT);
+      shader->info.fs.uses_discard = true;
+      break;
+
+   case nir_intrinsic_load_front_face:
+   case nir_intrinsic_load_vertex_id:
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_sample_id:
+   case nir_intrinsic_load_sample_pos:
+   case nir_intrinsic_load_sample_mask_in:
+   case nir_intrinsic_load_primitive_id:
+   case nir_intrinsic_load_invocation_id:
+   case nir_intrinsic_load_local_invocation_id:
+   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_num_work_groups:
+      shader->info.system_values_read |=
+         (1 << nir_system_value_from_intrinsic(instr->intrinsic));
+      break;
+
+   case nir_intrinsic_end_primitive:
+   case nir_intrinsic_end_primitive_with_counter:
+      assert(shader->stage == MESA_SHADER_GEOMETRY);
+      shader->info.gs.uses_end_primitive = 1;
+      break;
+
+   default:
+      break;
+   }
+}
+
+static void
+gather_tex_info(nir_tex_instr *instr, nir_shader *shader)
+{
+   if (instr->op == nir_texop_tg4)
+      shader->info.uses_texture_gather = true;
+}
+
+static bool
+gather_info_block(nir_block *block, void *shader)
+{
+   nir_foreach_instr(block, instr) {
+      switch (instr->type) {
+      case nir_instr_type_intrinsic:
+         gather_intrinsic_info(nir_instr_as_intrinsic(instr), shader);
+         break;
+      case nir_instr_type_tex:
+         gather_tex_info(nir_instr_as_tex(instr), shader);
+         break;
+      case nir_instr_type_call:
+         assert(!"nir_shader_gather_info only works if functions are inlined");
+         break;
+      default:
+         break;
+      }
+   }
+
+   return true;
+}
+
+void
+nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint)
+{
+   shader->info.inputs_read = 0;
+   foreach_list_typed(nir_variable, var, node, &shader->inputs)
+      shader->info.inputs_read |= nir_variable_get_io_mask(var, shader->stage);
+
+   /* TODO: Some day we may need to add stream support to NIR */
+   shader->info.outputs_written = 0;
+   foreach_list_typed(nir_variable, var, node, &shader->outputs)
+      shader->info.outputs_written |= nir_variable_get_io_mask(var, shader->stage);
+
+   shader->info.system_values_read = 0;
+   foreach_list_typed(nir_variable, var, node, &shader->system_values)
+      shader->info.system_values_read |= nir_variable_get_io_mask(var, shader->stage);
+
+   nir_foreach_block(entrypoint, gather_info_block, shader);
+}
index 1c36067..db15d16 100644 (file)
@@ -55,15 +55,15 @@ nir_gs_count_vertices(const nir_shader *shader)
 {
    int count = -1;
 
-   nir_foreach_overload(shader, overload) {
-      if (!overload->impl)
+   nir_foreach_function(shader, function) {
+      if (!function->impl)
          continue;
 
       /* set_vertex_count intrinsics only appear in predecessors of the
        * end block.  So we don't need to walk all of them.
        */
       struct set_entry *entry;
-      set_foreach(overload->impl->end_block->predecessors, entry) {
+      set_foreach(function->impl->end_block->predecessors, entry) {
          nir_block *block = (nir_block *) entry->key;
 
          nir_foreach_instr_reverse(block, instr) {
diff --git a/src/glsl/nir/nir_inline_functions.c b/src/glsl/nir/nir_inline_functions.c
new file mode 100644 (file)
index 0000000..3cf8327
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_control_flow.h"
+
+struct inline_functions_state {
+   struct set *inlined;
+   nir_builder builder;
+   bool progress;
+};
+
+static bool inline_function_impl(nir_function_impl *impl, struct set *inlined);
+
+static bool
+inline_functions_block(nir_block *block, void *void_state)
+{
+   struct inline_functions_state *state = void_state;
+
+   nir_builder *b = &state->builder;
+
+   /* This is tricky.  We're iterating over instructions in a block but, as
+    * we go, the block and its instruction list are being split into
+    * pieces.  However, this *should* be safe since foreach_safe always
+    * stashes the next thing in the iteration.  That next thing will
+    * properly get moved to the next block when it gets split, and we
+    * continue iterating there.
+    */
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_call)
+         continue;
+
+      state->progress = true;
+
+      nir_call_instr *call = nir_instr_as_call(instr);
+      assert(call->callee->impl);
+
+      inline_function_impl(call->callee->impl, state->inlined);
+
+      nir_function_impl *callee_copy =
+         nir_function_impl_clone(call->callee->impl);
+
+      exec_list_append(&b->impl->locals, &callee_copy->locals);
+      exec_list_append(&b->impl->registers, &callee_copy->registers);
+
+      b->cursor = nir_before_instr(&call->instr);
+
+      /* Add copies of all in parameters */
+      assert(call->num_params == callee_copy->num_params);
+      for (unsigned i = 0; i < callee_copy->num_params; i++) {
+         /* Only in or inout parameters */
+         if (call->callee->params[i].param_type == nir_parameter_out)
+            continue;
+
+         nir_copy_deref_var(b, nir_deref_var_create(b->shader,
+                                                    callee_copy->params[i]),
+                               call->params[i]);
+      }
+
+      /* Pluck the body out of the function and place it here */
+      nir_cf_list body;
+      nir_cf_list_extract(&body, &callee_copy->body);
+      nir_cf_reinsert(&body, b->cursor);
+
+      b->cursor = nir_before_instr(&call->instr);
+
+      /* Add copies of all out parameters and the return */
+      assert(call->num_params == callee_copy->num_params);
+      for (unsigned i = 0; i < callee_copy->num_params; i++) {
+         /* Only out or inout parameters */
+         if (call->callee->params[i].param_type == nir_parameter_in)
+            continue;
+
+         nir_copy_deref_var(b, call->params[i],
+                               nir_deref_var_create(b->shader,
+                                                    callee_copy->params[i]));
+      }
+      if (!glsl_type_is_void(call->callee->return_type)) {
+         nir_copy_deref_var(b, call->return_deref,
+                               nir_deref_var_create(b->shader,
+                                                    callee_copy->return_var));
+      }
+
+      nir_instr_remove(&call->instr);
+   }
+
+   return true;
+}
+
+static bool
+inline_function_impl(nir_function_impl *impl, struct set *inlined)
+{
+   if (_mesa_set_search(inlined, impl))
+      return false; /* Already inlined */
+
+   struct inline_functions_state state;
+
+   state.inlined = inlined;
+   state.progress = false;
+   nir_builder_init(&state.builder, impl);
+
+   nir_foreach_block(impl, inline_functions_block, &state);
+
+   if (state.progress) {
+      /* SSA and register indices are completely messed up now */
+      nir_index_ssa_defs(impl);
+      nir_index_local_regs(impl);
+
+      nir_metadata_preserve(impl, nir_metadata_none);
+   }
+
+   _mesa_set_add(inlined, impl);
+
+   return state.progress;
+}
+
+bool
+nir_inline_functions(nir_shader *shader)
+{
+   struct set *inlined = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = inline_function_impl(function->impl, inlined) || progress;
+   }
+
+   _mesa_set_destroy(inlined, NULL);
+
+   return progress;
+}
index d3f939f..eb02132 100644 (file)
@@ -155,8 +155,9 @@ hash_tex(uint32_t hash, const nir_tex_instr *instr)
    hash = HASH(hash, instr->const_offset);
    unsigned component = instr->component;
    hash = HASH(hash, component);
+   hash = HASH(hash, instr->texture_index);
+   hash = HASH(hash, instr->texture_array_size);
    hash = HASH(hash, instr->sampler_index);
-   hash = HASH(hash, instr->sampler_array_size);
 
    assert(!instr->sampler);
 
@@ -305,13 +306,15 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
           memcmp(tex1->const_offset, tex2->const_offset,
                  sizeof(tex1->const_offset)) != 0 ||
           tex1->component != tex2->component ||
-         tex1->sampler_index != tex2->sampler_index ||
-         tex1->sampler_array_size != tex2->sampler_array_size) {
+         tex1->texture_index != tex2->texture_index ||
+         tex1->texture_array_size != tex2->texture_array_size ||
+         tex1->sampler_index != tex2->sampler_index) {
          return false;
       }
 
       /* Don't support un-lowered sampler derefs currently. */
-      assert(!tex1->sampler && !tex2->sampler);
+      assert(!tex1->texture && !tex1->sampler &&
+             !tex2->texture && !tex2->sampler);
 
       return true;
    }
@@ -422,7 +425,7 @@ instr_can_rewrite(nir_instr *instr)
       nir_tex_instr *tex = nir_instr_as_tex(instr);
 
       /* Don't support un-lowered sampler derefs currently. */
-      if (tex->sampler)
+      if (tex->texture || tex->sampler)
          return false;
 
       return true;
index 5b10423..3e7cf73 100644 (file)
@@ -43,7 +43,7 @@
 
 
 INTRINSIC(load_var, 0, ARR(), true, 0, 1, 0, NIR_INTRINSIC_CAN_ELIMINATE)
-INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 0, 0)
+INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, 0)
 INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, 0)
 
 /*
@@ -176,6 +176,52 @@ INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0,
           NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
+ * Vulkan descriptor set intrinsic
+ *
+ * The Vulkan API uses a different binding model from GL.  In the Vulkan
+ * API, all external resources are represented by a triple:
+ *
+ * (descriptor set, binding, array index)
+ *
+ * where the array index is the only thing allowed to be indirect.  The
+ * vulkan_resource_index intrinsic takes the descriptor set and binding as
+ * its first two indices and the array index as its source.  The third
+ * index is a nir_variable_mode in case that's useful to the backend.
+ *
+ * The intended usage is that the shader will call vulkan_resource_index to
+ * get an index and then pass that as the buffer index to ubo/ssbo calls.
+ */
+INTRINSIC(vulkan_resource_index, 1, ARR(1), true, 1, 0, 3,
+          NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+
+/*
+ * variable atomic intrinsics
+ *
+ * All of these variable atomic memory operations read a value from memory,
+ * compute a new value using one of the operations below, write the new value
+ * to memory, and return the original value read.
+ *
+ * All operations take 1 source except CompSwap that takes 2. These sources
+ * represent:
+ *
+ * 0: The data parameter to the atomic function (i.e. the value to add
+ *    in shared_atomic_add, etc).
+ * 1: For CompSwap only: the second data parameter.
+ *
+ * All operations take 1 variable deref.
+ */
+INTRINSIC(var_atomic_add, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_imin, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_umin, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_imax, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_umax, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_and, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_or, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_xor, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_exchange, 1, ARR(1), true, 1, 1, 0, 0)
+INTRINSIC(var_atomic_comp_swap, 2, ARR(1, 1), true, 1, 1, 0, 0)
+
+/*
  * SSBO atomic intrinsics
  *
  * All of the SSBO atomic memory operations read a value from memory,
@@ -239,6 +285,8 @@ SYSTEM_VALUE(vertex_id, 1, 0)
 SYSTEM_VALUE(vertex_id_zero_base, 1, 0)
 SYSTEM_VALUE(base_vertex, 1, 0)
 SYSTEM_VALUE(instance_id, 1, 0)
+SYSTEM_VALUE(base_instance, 1, 0)
+SYSTEM_VALUE(draw_id, 1, 0)
 SYSTEM_VALUE(sample_id, 1, 0)
 SYSTEM_VALUE(sample_pos, 2, 0)
 SYSTEM_VALUE(sample_mask_in, 1, 0)
@@ -294,6 +342,8 @@ LOAD(output, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
 LOAD(per_vertex_output, 2, 1, NIR_INTRINSIC_CAN_ELIMINATE)
 /* src[] = { offset }. const_index[] = { base } */
 LOAD(shared, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
+/* src[] = { offset }. const_index[] = { base, size } */
+LOAD(push_constant, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
  * Stores work the same way as loads, except now the first source is the value
@@ -305,13 +355,13 @@ LOAD(shared, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
 #define STORE(name, srcs, indices, flags) \
    INTRINSIC(store_##name, srcs, ARR(0, 1, 1, 1), false, 0, 0, indices, flags)
 
-/* src[] = { value, offset }. const_index[] = { base } */
-STORE(output, 2, 1, 0)
-/* src[] = { value, vertex, offset }. const_index[] = { base } */
-STORE(per_vertex_output, 3, 1, 0)
+/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+STORE(output, 2, 2, 0)
+/* src[] = { value, vertex, offset }. const_index[] = { base, write_mask } */
+STORE(per_vertex_output, 3, 2, 0)
 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
 STORE(ssbo, 3, 1, 0)
 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
-STORE(shared, 2, 1, 0)
+STORE(shared, 2, 2, 0)
 
 LAST_INTRINSIC(store_shared)
index 9313fc0..0a27e66 100644 (file)
@@ -137,10 +137,6 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
       LOWER_REDUCTION(nir_op_bany_inequal, nir_op_ine, nir_op_ior);
       LOWER_REDUCTION(nir_op_fall_equal, nir_op_seq, nir_op_fand);
       LOWER_REDUCTION(nir_op_fany_nequal, nir_op_sne, nir_op_for);
-      LOWER_REDUCTION(nir_op_ball, nir_op_imov, nir_op_iand);
-      LOWER_REDUCTION(nir_op_bany, nir_op_imov, nir_op_ior);
-      LOWER_REDUCTION(nir_op_fall, nir_op_fmov, nir_op_fand);
-      LOWER_REDUCTION(nir_op_fany, nir_op_fmov, nir_op_for);
 
    default:
       break;
@@ -207,8 +203,8 @@ nir_lower_alu_to_scalar_impl(nir_function_impl *impl)
 void
 nir_lower_alu_to_scalar(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_alu_to_scalar_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_alu_to_scalar_impl(function->impl);
    }
 }
index 40ca3de..a9d0ddb 100644 (file)
@@ -63,7 +63,8 @@ lower_instr(nir_intrinsic_instr *instr,
    }
 
    if (instr->variables[0]->var->data.mode != nir_var_uniform &&
-       instr->variables[0]->var->data.mode != nir_var_shader_storage)
+       instr->variables[0]->var->data.mode != nir_var_shader_storage &&
+       instr->variables[0]->var->data.mode != nir_var_shared)
       return; /* atomics passed as function arguments can't be lowered */
 
    void *mem_ctx = ralloc_parent(instr);
@@ -74,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index;
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.atomic.offset;
+   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
 
@@ -156,10 +157,10 @@ nir_lower_atomics(nir_shader *shader,
       .shader_program = shader_program,
    };
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         nir_foreach_block(overload->impl, lower_block, (void *) &state);
-         nir_metadata_preserve(overload->impl, nir_metadata_block_index |
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_foreach_block(function->impl, lower_block, (void *) &state);
+         nir_metadata_preserve(function->impl, nir_metadata_block_index |
                                                nir_metadata_dominance);
       }
    }
index 36cc578..0ca6a28 100644 (file)
@@ -72,6 +72,7 @@ store_clipdist_output(nir_builder *b, nir_variable *out, nir_ssa_def **val)
    store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
    store->num_components = 4;
    store->const_index[0] = out->data.driver_location;
+   store->const_index[1] = 0xf;   /* wrmask */
    store->src[0].ssa = nir_vec4(b, val[0], val[1], val[2], val[3]);
    store->src[0].is_ssa = true;
    store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
@@ -143,9 +144,9 @@ find_output(nir_shader *shader, unsigned drvloc)
       .drvloc = drvloc,
    };
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         nir_foreach_block_reverse(overload->impl,
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_foreach_block_reverse(function->impl,
                                    find_output_in_block, &state);
       }
    }
@@ -180,18 +181,11 @@ lower_clip_vs(nir_function_impl *impl, unsigned ucp_enables,
 
    for (int plane = 0; plane < MAX_CLIP_PLANES; plane++) {
       if (ucp_enables & (1 << plane)) {
-         nir_intrinsic_instr *ucp;
-
-         /* insert intrinsic to fetch ucp[plane]: */
-         ucp = nir_intrinsic_instr_create(b.shader,
-                                          nir_intrinsic_load_user_clip_plane);
-         ucp->num_components = 4;
-         ucp->const_index[0] = plane;
-         nir_ssa_dest_init(&ucp->instr, &ucp->dest, 4, NULL);
-         nir_builder_instr_insert(&b, &ucp->instr);
+         nir_ssa_def *ucp =
+            nir_load_system_value(&b, nir_intrinsic_load_user_clip_plane, plane);
 
          /* calculate clipdist[plane] - dot(ucp, cv): */
-         clipdist[plane] = nir_fdot4(&b, &ucp->dest.ssa, cv);
+         clipdist[plane] = nir_fdot4(&b, ucp, cv);
       }
       else {
          /* 0.0 == don't-clip == disabled: */
@@ -264,9 +258,9 @@ nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables)
       out[1] =
          create_clipdist_var(shader, ++maxloc, true, VARYING_SLOT_CLIP_DIST1);
 
-   nir_foreach_overload(shader, overload) {
-      if (!strcmp(overload->function->name, "main"))
-         lower_clip_vs(overload->impl, ucp_enables, cv, out);
+   nir_foreach_function(shader, function) {
+      if (!strcmp(function->name, "main"))
+         lower_clip_vs(function->impl, ucp_enables, cv, out);
    }
 }
 
@@ -338,8 +332,8 @@ nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables)
          create_clipdist_var(shader, ++maxloc, false,
                              VARYING_SLOT_CLIP_DIST1);
 
-   nir_foreach_overload(shader, overload) {
-      if (!strcmp(overload->function->name, "main"))
-         lower_clip_fs(overload->impl, ucp_enables, in);
+   nir_foreach_function(shader, function) {
+      if (!strcmp(function->name, "main"))
+         lower_clip_fs(function->impl, ucp_enables, in);
    }
 }
index d549ee7..7b4cd4e 100644 (file)
@@ -82,10 +82,10 @@ nir_lower_global_vars_to_local(nir_shader *shader)
    state.var_func_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                   _mesa_key_pointer_equal);
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         state.impl = overload->impl;
-         nir_foreach_block(overload->impl, mark_global_var_uses_block, &state);
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         state.impl = function->impl;
+         nir_foreach_block(function->impl, mark_global_var_uses_block, &state);
       }
    }
 
index e0d0678..fdff165 100644 (file)
@@ -99,7 +99,8 @@ rewrite_emit_vertex(nir_intrinsic_instr *intrin, struct state *state)
 
    /* Increment the vertex count by 1 */
    nir_store_var(b, state->vertex_count_var,
-                 nir_iadd(b, count, nir_imm_int(b, 1)));
+                 nir_iadd(b, count, nir_imm_int(b, 1)),
+                 0x1); /* .x */
 
    nir_instr_remove(&intrin->instr);
 
@@ -199,18 +200,18 @@ nir_lower_gs_intrinsics(nir_shader *shader)
    exec_list_push_tail(&shader->globals, &var->node);
    state.vertex_count_var = var;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
          nir_builder b;
-         nir_builder_init(&b, overload->impl);
+         nir_builder_init(&b, function->impl);
          state.builder = &b;
 
-         nir_foreach_block(overload->impl, rewrite_intrinsics, &state);
+         nir_foreach_block(function->impl, rewrite_intrinsics, &state);
 
          /* This only works because we have a single main() function. */
-         append_set_vertex_count(overload->impl->end_block, &state);
+         append_set_vertex_count(function->impl->end_block, &state);
 
-         nir_metadata_preserve(overload->impl, 0);
+         nir_metadata_preserve(function->impl, 0);
       }
    }
 
index f64b3ea..a084ad9 100644 (file)
@@ -144,8 +144,8 @@ convert_impl(nir_function_impl *impl)
 void
 nir_lower_idiv(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         convert_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         convert_impl(function->impl);
    }
 }
diff --git a/src/glsl/nir/nir_lower_indirect_derefs.c b/src/glsl/nir/nir_lower_indirect_derefs.c
new file mode 100644 (file)
index 0000000..69f2df4
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+static void
+emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                nir_deref_var *deref, nir_deref *tail,
+                nir_ssa_def **dest, nir_ssa_def *src);
+
+static void
+emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                         nir_deref_var *deref, nir_deref *arr_parent,
+                         int start, int end,
+                         nir_ssa_def **dest, nir_ssa_def *src)
+{
+   assert(arr_parent->child &&
+          arr_parent->child->deref_type == nir_deref_type_array);
+   nir_deref_array *arr = nir_deref_as_array(arr_parent->child);
+   assert(arr->deref_array_type == nir_deref_array_type_indirect);
+   assert(arr->indirect.is_ssa);
+
+   assert(start < end);
+   if (start == end - 1) {
+      /* Base case.  Just emit the load/store op */
+      nir_deref_array direct = *arr;
+      direct.deref_array_type = nir_deref_array_type_direct;
+      direct.base_offset += start;
+      direct.indirect = NIR_SRC_INIT;
+
+      arr_parent->child = &direct.deref;
+      emit_load_store(b, orig_instr, deref, &arr->deref, dest, src);
+      arr_parent->child = &arr->deref;
+   } else {
+      int mid = start + (end - start) / 2;
+
+      nir_ssa_def *then_dest, *else_dest;
+
+      nir_if *if_stmt = nir_if_create(b->shader);
+      if_stmt->condition = nir_src_for_ssa(nir_ilt(b, arr->indirect.ssa,
+                                                      nir_imm_int(b, mid)));
+      nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+      b->cursor = nir_after_cf_list(&if_stmt->then_list);
+      emit_indirect_load_store(b, orig_instr, deref, arr_parent,
+                               start, mid, &then_dest, src);
+
+      b->cursor = nir_after_cf_list(&if_stmt->else_list);
+      emit_indirect_load_store(b, orig_instr, deref, arr_parent,
+                               mid, end, &else_dest, src);
+
+      b->cursor = nir_after_cf_node(&if_stmt->cf_node);
+
+      if (src == NULL) {
+         /* We're a load.  We need to insert a phi node */
+         nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+         nir_ssa_dest_init(&phi->instr, &phi->dest,
+                           then_dest->num_components, NULL);
+
+         nir_phi_src *src0 = ralloc(phi, nir_phi_src);
+         src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt));
+         src0->src = nir_src_for_ssa(then_dest);
+         exec_list_push_tail(&phi->srcs, &src0->node);
+
+         nir_phi_src *src1 = ralloc(phi, nir_phi_src);
+         src1->pred = nir_cf_node_as_block(nir_if_last_else_node(if_stmt));
+         src1->src = nir_src_for_ssa(else_dest);
+         exec_list_push_tail(&phi->srcs, &src1->node);
+
+         nir_builder_instr_insert(b, &phi->instr);
+         *dest = &phi->dest.ssa;
+      }
+   }
+}
+
+static void
+emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                nir_deref_var *deref, nir_deref *tail,
+                nir_ssa_def **dest, nir_ssa_def *src)
+{
+   for (; tail->child; tail = tail->child) {
+      if (tail->child->deref_type != nir_deref_type_array)
+         continue;
+
+      nir_deref_array *arr = nir_deref_as_array(tail->child);
+      if (arr->deref_array_type != nir_deref_array_type_indirect)
+         continue;
+
+      int length = glsl_get_length(tail->type);
+
+      emit_indirect_load_store(b, orig_instr, deref, tail, -arr->base_offset,
+                               length - arr->base_offset, dest, src);
+      return;
+   }
+
+   assert(tail && tail->child == NULL);
+
+   /* We reached the end of the deref chain.  Emit the instruction */
+
+   if (src == NULL) {
+      /* This is a load instruction */
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
+      load->num_components = orig_instr->num_components;
+      load->variables[0] =
+         nir_deref_as_var(nir_copy_deref(load, &deref->deref));
+      nir_ssa_dest_init(&load->instr, &load->dest,
+                        load->num_components, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+      *dest = &load->dest.ssa;
+   } else {
+      /* This is a store instruction */
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
+      store->num_components = orig_instr->num_components;
+      store->const_index[0] = orig_instr->const_index[0]; /* writemask */
+      store->variables[0] =
+         nir_deref_as_var(nir_copy_deref(store, &deref->deref));
+      store->src[0] = nir_src_for_ssa(src);
+      nir_builder_instr_insert(b, &store->instr);
+   }
+}
+
+static bool
+deref_has_indirect(nir_deref_var *deref)
+{
+   for (nir_deref *tail = deref->deref.child; tail; tail = tail->child) {
+      if (tail->deref_type != nir_deref_type_array)
+         continue;
+
+      nir_deref_array *arr = nir_deref_as_array(tail);
+      if (arr->deref_array_type == nir_deref_array_type_indirect)
+         return true;
+   }
+
+   return false;
+}
+
+struct lower_indirect_state {
+   nir_builder builder;
+   uint32_t mode_mask;
+   bool progress;
+};
+
+static bool
+lower_indirect_block(nir_block *block, void *void_state)
+{
+   struct lower_indirect_state *state = void_state;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic != nir_intrinsic_load_var &&
+          intrin->intrinsic != nir_intrinsic_store_var)
+         continue;
+
+      if (!deref_has_indirect(intrin->variables[0]))
+         continue;
+
+      /* Only lower variables whose mode is in the mask */
+      if (!(state->mode_mask & (1 << intrin->variables[0]->var->data.mode)))
+         continue;
+
+      state->builder.cursor = nir_before_instr(&intrin->instr);
+
+      if (intrin->intrinsic == nir_intrinsic_load_var) {
+         nir_ssa_def *result;
+         emit_load_store(&state->builder, intrin, intrin->variables[0],
+                         &intrin->variables[0]->deref, &result, NULL);
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(result));
+      } else {
+         assert(intrin->src[0].is_ssa);
+         emit_load_store(&state->builder, intrin, intrin->variables[0],
+                         &intrin->variables[0]->deref, NULL, intrin->src[0].ssa);
+      }
+      nir_instr_remove(&intrin->instr);
+      state->progress = true;
+   }
+
+   return true;
+}
+
+static bool
+lower_indirects_impl(nir_function_impl *impl, uint32_t mode_mask)
+{
+   struct lower_indirect_state state;
+
+   state.progress = false;
+   state.mode_mask = mode_mask;
+   nir_builder_init(&state.builder, impl);
+
+   nir_foreach_block(impl, lower_indirect_block, &state);
+
+   if (state.progress)
+      nir_metadata_preserve(impl, nir_metadata_none);
+
+   return state.progress;
+}
+
+/** Lowers indirect variable loads/stores to direct loads/stores.
+ *
+ * The pass works by replacing any indirect load or store with an if-ladder
+ * that does a binary search on the array index.
+ */
+bool
+nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask)
+{
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = lower_indirects_impl(function->impl, mode_mask) || progress;
+   }
+
+   return progress;
+}
index c1f034b..2c5fa16 100644 (file)
@@ -160,12 +160,56 @@ load_op(struct lower_io_state *state,
    case nir_var_uniform:
       op = nir_intrinsic_load_uniform;
       break;
+   case nir_var_shared:
+      op = nir_intrinsic_load_shared;
+      break;
    default:
       unreachable("Unknown variable mode");
    }
    return op;
 }
 
+static nir_intrinsic_op
+store_op(struct lower_io_state *state,
+         nir_variable_mode mode, bool per_vertex)
+{
+   nir_intrinsic_op op;
+   switch (mode) {
+   case nir_var_shader_in:
+   case nir_var_shader_out:
+      op = per_vertex ? nir_intrinsic_store_per_vertex_output :
+                        nir_intrinsic_store_output;
+      break;
+   case nir_var_shared:
+      op = nir_intrinsic_store_shared;
+      break;
+   default:
+      unreachable("Unknown variable mode");
+   }
+   return op;
+}
+
+static nir_intrinsic_op
+atomic_op(nir_intrinsic_op opcode)
+{
+   switch (opcode) {
+#define OP(O) case nir_intrinsic_var_##O: return nir_intrinsic_shared_##O;
+   OP(atomic_exchange)
+   OP(atomic_comp_swap)
+   OP(atomic_add)
+   OP(atomic_imin)
+   OP(atomic_umin)
+   OP(atomic_imax)
+   OP(atomic_umax)
+   OP(atomic_and)
+   OP(atomic_or)
+   OP(atomic_xor)
+#undef OP
+   default:
+      unreachable("Invalid atomic");
+   }
+}
+
 static bool
 nir_lower_io_block(nir_block *block, void *void_state)
 {
@@ -179,9 +223,25 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
-      if (intrin->intrinsic != nir_intrinsic_load_var &&
-          intrin->intrinsic != nir_intrinsic_store_var)
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_var:
+      case nir_intrinsic_store_var:
+      case nir_intrinsic_var_atomic_add:
+      case nir_intrinsic_var_atomic_imin:
+      case nir_intrinsic_var_atomic_umin:
+      case nir_intrinsic_var_atomic_imax:
+      case nir_intrinsic_var_atomic_umax:
+      case nir_intrinsic_var_atomic_and:
+      case nir_intrinsic_var_atomic_or:
+      case nir_intrinsic_var_atomic_xor:
+      case nir_intrinsic_var_atomic_exchange:
+      case nir_intrinsic_var_atomic_comp_swap:
+         /* We can lower the io for this nir intrinsic */
+         break;
+      default:
+         /* We can't lower the io for this nir instrinsic, so skip it */
          continue;
+      }
 
       nir_variable_mode mode = intrin->variables[0]->var->data.mode;
 
@@ -190,6 +250,7 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
       if (mode != nir_var_shader_in &&
           mode != nir_var_shader_out &&
+          mode != nir_var_shared &&
           mode != nir_var_uniform)
          continue;
 
@@ -241,7 +302,7 @@ nir_lower_io_block(nir_block *block, void *void_state)
       }
 
       case nir_intrinsic_store_var: {
-         assert(mode == nir_var_shader_out);
+         assert(mode == nir_var_shader_out || mode == nir_var_shared);
 
          nir_ssa_def *offset;
          nir_ssa_def *vertex_index;
@@ -253,12 +314,9 @@ nir_lower_io_block(nir_block *block, void *void_state)
                                 per_vertex ? &vertex_index : NULL,
                                 state->type_size);
 
-         nir_intrinsic_op store_op =
-            per_vertex ? nir_intrinsic_store_per_vertex_output :
-                         nir_intrinsic_store_output;
-
-         nir_intrinsic_instr *store = nir_intrinsic_instr_create(state->mem_ctx,
-                                                                 store_op);
+         nir_intrinsic_instr *store =
+            nir_intrinsic_instr_create(state->mem_ctx,
+                                       store_op(state, mode, per_vertex));
          store->num_components = intrin->num_components;
 
          nir_src_copy(&store->src[0], &intrin->src[0], store);
@@ -266,6 +324,9 @@ nir_lower_io_block(nir_block *block, void *void_state)
          store->const_index[0] =
             intrin->variables[0]->var->data.driver_location;
 
+         /* Copy the writemask */
+         store->const_index[1] = intrin->const_index[0];
+
          if (per_vertex)
             store->src[1] = nir_src_for_ssa(vertex_index);
 
@@ -276,6 +337,51 @@ nir_lower_io_block(nir_block *block, void *void_state)
          break;
       }
 
+      case nir_intrinsic_var_atomic_add:
+      case nir_intrinsic_var_atomic_imin:
+      case nir_intrinsic_var_atomic_umin:
+      case nir_intrinsic_var_atomic_imax:
+      case nir_intrinsic_var_atomic_umax:
+      case nir_intrinsic_var_atomic_and:
+      case nir_intrinsic_var_atomic_or:
+      case nir_intrinsic_var_atomic_xor:
+      case nir_intrinsic_var_atomic_exchange:
+      case nir_intrinsic_var_atomic_comp_swap: {
+         assert(mode == nir_var_shared);
+
+         nir_ssa_def *offset;
+
+         offset = get_io_offset(b, intrin->variables[0],
+                                NULL, state->type_size);
+
+         nir_intrinsic_instr *atomic =
+            nir_intrinsic_instr_create(state->mem_ctx,
+                                       atomic_op(intrin->intrinsic));
+
+         atomic->src[0] = nir_src_for_ssa(offset);
+
+         atomic->const_index[0] =
+            intrin->variables[0]->var->data.driver_location;
+
+         nir_src_copy(&atomic->src[1], &intrin->src[0], atomic);
+
+         if (intrin->intrinsic == nir_intrinsic_var_atomic_comp_swap)
+            nir_src_copy(&atomic->src[2], &intrin->src[1], atomic);
+
+         if (intrin->dest.is_ssa) {
+            nir_ssa_dest_init(&atomic->instr, &atomic->dest,
+                              intrin->dest.ssa.num_components, NULL);
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     nir_src_for_ssa(&atomic->dest.ssa));
+         } else {
+            nir_dest_copy(&atomic->dest, &intrin->dest, state->mem_ctx);
+         }
+
+         nir_instr_insert_before(&intrin->instr, &atomic->instr);
+         nir_instr_remove(&intrin->instr);
+         break;
+      }
+
       default:
          break;
       }
@@ -306,9 +412,9 @@ void
 nir_lower_io(nir_shader *shader, nir_variable_mode mode,
              int (*type_size)(const struct glsl_type *))
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_io_impl(overload->impl, mode, type_size);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_io_impl(function->impl, mode, type_size);
    }
 }
 
@@ -323,10 +429,13 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_output:
    case nir_intrinsic_load_uniform:
       return &instr->src[0];
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ssbo:
    case nir_intrinsic_load_per_vertex_input:
    case nir_intrinsic_load_per_vertex_output:
    case nir_intrinsic_store_output:
       return &instr->src[1];
+   case nir_intrinsic_store_ssbo:
    case nir_intrinsic_store_per_vertex_output:
       return &instr->src[2];
    default:
index 84d0c14..1eeed13 100644 (file)
@@ -82,8 +82,8 @@ nir_lower_load_const_to_scalar_impl(nir_function_impl *impl)
 void
 nir_lower_load_const_to_scalar(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_load_const_to_scalar_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_load_const_to_scalar_impl(function->impl);
    }
 }
index 17b53ca..51b0fa7 100644 (file)
@@ -243,7 +243,7 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
 
          nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov);
          nir_src_copy(&mov->src[0].src, &intrin->src[0], mov);
-         mov->dest.write_mask = (1 << intrin->num_components) - 1;
+         mov->dest.write_mask = intrin->const_index[0];
          mov->dest.dest.is_ssa = false;
          mov->dest.dest.reg.reg = reg_src.reg.reg;
          mov->dest.dest.reg.base_offset = reg_src.reg.base_offset;
@@ -348,7 +348,7 @@ nir_lower_locals_to_regs_impl(nir_function_impl *impl)
 {
    struct locals_to_regs_state state;
 
-   state.shader = impl->overload->function->shader;
+   state.shader = impl->function->shader;
    state.impl = impl;
    state.progress = false;
    state.regs_table = _mesa_hash_table_create(NULL, hash_deref, derefs_equal);
@@ -387,9 +387,9 @@ nir_lower_locals_to_regs(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = nir_lower_locals_to_regs_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_lower_locals_to_regs_impl(function->impl) || progress;
    }
 
    return progress;
index 9441f47..00ac091 100644 (file)
@@ -74,7 +74,7 @@ emit_output_copies_block(nir_block *block, void *state)
 }
 
 void
-nir_lower_outputs_to_temporaries(nir_shader *shader)
+nir_lower_outputs_to_temporaries(nir_shader *shader, nir_function *entrypoint)
 {
    struct lower_outputs_state state;
 
@@ -97,6 +97,9 @@ nir_lower_outputs_to_temporaries(nir_shader *shader)
       /* Reparent the name to the new variable */
       ralloc_steal(output, output->name);
 
+      /* Reparent the constant initializer (if any) */
+      ralloc_steal(output, output->constant_initializer);
+
       /* Give the output a new name with @out-temp appended */
       temp->name = ralloc_asprintf(var, "%s@out-temp", output->name);
       temp->data.mode = nir_var_global;
@@ -105,27 +108,27 @@ nir_lower_outputs_to_temporaries(nir_shader *shader)
       exec_list_push_tail(&shader->outputs, &output->node);
    }
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl == NULL)
+   nir_foreach_function(shader, function) {
+      if (function->impl == NULL)
          continue;
 
       if (shader->stage == MESA_SHADER_GEOMETRY) {
          /* For geometry shaders, we have to emit the output copies right
           * before each EmitVertex call.
           */
-         nir_foreach_block(overload->impl, emit_output_copies_block, &state);
-      } else if (strcmp(overload->function->name, "main") == 0) {
+         nir_foreach_block(function->impl, emit_output_copies_block, &state);
+      } else if (function == entrypoint) {
          /* For all other shader types, we need to do the copies right before
           * the jumps to the end block.
           */
          struct set_entry *block_entry;
-         set_foreach(overload->impl->end_block->predecessors, block_entry) {
+         set_foreach(function->impl->end_block->predecessors, block_entry) {
             struct nir_block *block = (void *)block_entry->key;
             emit_output_copies(nir_after_block_before_jump(block), &state);
          }
       }
 
-      nir_metadata_preserve(overload->impl, nir_metadata_block_index |
+      nir_metadata_preserve(function->impl, nir_metadata_block_index |
                                             nir_metadata_dominance);
    }
 
index 2f5927f..dd2abcf 100644 (file)
@@ -286,8 +286,8 @@ lower_phis_to_scalar_impl(nir_function_impl *impl)
 void
 nir_lower_phis_to_scalar(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         lower_phis_to_scalar_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         lower_phis_to_scalar_impl(function->impl);
    }
 }
diff --git a/src/glsl/nir/nir_lower_returns.c b/src/glsl/nir/nir_lower_returns.c
new file mode 100644 (file)
index 0000000..91bb2f7
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_control_flow.h"
+
+struct lower_returns_state {
+   nir_builder builder;
+   struct exec_list *cf_list;
+   nir_loop *loop;
+   nir_variable *return_flag;
+};
+
+static bool lower_returns_in_cf_list(struct exec_list *cf_list,
+                                     struct lower_returns_state *state);
+
+static void
+predicate_following(nir_cf_node *node, struct lower_returns_state *state)
+{
+   nir_builder *b = &state->builder;
+   b->cursor = nir_after_cf_node_and_phis(node);
+
+   if (nir_cursors_equal(b->cursor, nir_after_cf_list(state->cf_list)))
+      return; /* Nothing to predicate */
+
+   assert(state->return_flag);
+
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(nir_load_var(b, state->return_flag));
+   nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+   if (state->loop) {
+      /* If we're inside of a loop, then all we need to do is insert a
+       * conditional break.
+       */
+      nir_jump_instr *brk =
+         nir_jump_instr_create(state->builder.shader, nir_jump_break);
+      nir_instr_insert(nir_before_cf_list(&if_stmt->then_list), &brk->instr);
+   } else {
+      /* Otherwise, we need to actually move everything into the else case
+       * of the if statement.
+       */
+      nir_cf_list list;
+      nir_cf_extract(&list, nir_after_cf_node(&if_stmt->cf_node),
+                            nir_after_cf_list(state->cf_list));
+      assert(!exec_list_is_empty(&list.list));
+      nir_cf_reinsert(&list, nir_before_cf_list(&if_stmt->else_list));
+   }
+}
+
+static bool
+lower_returns_in_loop(nir_loop *loop, struct lower_returns_state *state)
+{
+   nir_loop *parent = state->loop;
+   state->loop = loop;
+   bool progress = lower_returns_in_cf_list(&loop->body, state);
+   state->loop = parent;
+
+   /* If the recursive call made progress, then there were returns inside
+    * of the loop.  These would have been lowered to breaks with the return
+    * flag set to true.  We need to predicate everything following the loop
+    * on the return flag.
+    */
+   if (progress)
+      predicate_following(&loop->cf_node, state);
+
+   return progress;
+}
+
+static bool
+lower_returns_in_if(nir_if *if_stmt, struct lower_returns_state *state)
+{
+   bool progress;
+
+   progress = lower_returns_in_cf_list(&if_stmt->then_list, state);
+   progress = lower_returns_in_cf_list(&if_stmt->else_list, state) || progress;
+
+   /* If either of the recursive calls made progress, then there were
+    * returns inside of the body of the if.  If we're in a loop, then these
+    * were lowered to breaks which automatically skip to the end of the
+    * loop so we don't have to do anything.  If we're not in a loop, then
+    * all we know is that the return flag is set appropriately and that the
+    * recursive calls ensured that nothing gets executed *inside* the if
+    * after a return.  In order to ensure nothing outside gets executed
+    * after a return, we need to predicate everything following on the
+    * return flag.
+    */
+   if (progress && !state->loop)
+      predicate_following(&if_stmt->cf_node, state);
+
+   return progress;
+}
+
+static bool
+lower_returns_in_block(nir_block *block, struct lower_returns_state *state)
+{
+   if (block->predecessors->entries == 0 &&
+       block != nir_start_block(state->builder.impl)) {
+      /* This block is unreachable.  Delete it and everything after it. */
+      nir_cf_list list;
+      nir_cf_extract(&list, nir_before_cf_node(&block->cf_node),
+                            nir_after_cf_list(state->cf_list));
+
+      if (exec_list_is_empty(&list.list)) {
+         /* There's nothing here, which also means there's nothing in this
+          * block so we have nothing to do.
+          */
+         return false;
+      } else {
+         nir_cf_delete(&list);
+         return true;
+      }
+   }
+
+   nir_instr *last_instr = nir_block_last_instr(block);
+   if (last_instr == NULL)
+      return false;
+
+   if (last_instr->type != nir_instr_type_jump)
+      return false;
+
+   nir_jump_instr *jump = nir_instr_as_jump(last_instr);
+   if (jump->type != nir_jump_return)
+      return false;
+
+   nir_instr_remove(&jump->instr);
+
+   nir_builder *b = &state->builder;
+   b->cursor = nir_after_block(block);
+
+   /* Set the return flag */
+   if (state->return_flag == NULL) {
+      state->return_flag =
+         nir_local_variable_create(b->impl, glsl_bool_type(), "return");
+
+      /* Set a default value of false */
+      state->return_flag->constant_initializer =
+         rzalloc(state->return_flag, nir_constant);
+   }
+   nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE), 1);
+
+   if (state->loop) {
+      /* We're in a loop;  we need to break out of it. */
+      nir_jump(b, nir_jump_break);
+   } else {
+      /* Not in a loop;  we'll deal with predicating later */
+      assert(nir_cf_node_next(&block->cf_node) == NULL);
+   }
+
+   return true;
+}
+
+static bool
+lower_returns_in_cf_list(struct exec_list *cf_list,
+                         struct lower_returns_state *state)
+{
+   bool progress = false;
+
+   struct exec_list *parent_list = state->cf_list;
+   state->cf_list = cf_list;
+
+   /* We iterate over the list backwards because any given lower call may
+    * take everything following the given CF node and predicate it.  In
+    * order to avoid recursion/iteration problems, we want everything after
+    * a given node to already be lowered before this happens.
+    */
+   foreach_list_typed_reverse_safe(nir_cf_node, node, node, cf_list) {
+      switch (node->type) {
+      case nir_cf_node_block:
+         if (lower_returns_in_block(nir_cf_node_as_block(node), state))
+            progress = true;
+         break;
+
+      case nir_cf_node_if:
+         if (lower_returns_in_if(nir_cf_node_as_if(node), state))
+            progress = true;
+         break;
+
+      case nir_cf_node_loop:
+         if (lower_returns_in_loop(nir_cf_node_as_loop(node), state))
+            progress = true;
+         break;
+
+      default:
+         unreachable("Invalid inner CF node type");
+      }
+   }
+
+   state->cf_list = parent_list;
+
+   return progress;
+}
+
+bool
+nir_lower_returns_impl(nir_function_impl *impl)
+{
+   struct lower_returns_state state;
+
+   state.cf_list = &impl->body;
+   state.loop = NULL;
+   state.return_flag = NULL;
+   nir_builder_init(&state.builder, impl);
+
+   bool progress = lower_returns_in_cf_list(&impl->body, &state);
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_none);
+      nir_repair_ssa_impl(impl);
+   }
+
+   return progress;
+}
+
+bool
+nir_lower_returns(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_lower_returns_impl(function->impl) || progress;
+   }
+
+   return progress;
+}
index 2aab305..e4cd415 100644 (file)
@@ -94,6 +94,9 @@ lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_progr
    if (instr->sampler == NULL)
       return;
 
+   /* GLSL only has combined textures/samplers */
+   assert(instr->texture == NULL);
+
    instr->sampler_index = 0;
    unsigned location = instr->sampler->var->data.location;
    unsigned array_elements = 1;
@@ -106,7 +109,7 @@ lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_progr
    if (indirect) {
       /* First, we have to resize the array of texture sources */
       nir_tex_src *new_srcs = rzalloc_array(instr, nir_tex_src,
-                                            instr->num_srcs + 1);
+                                            instr->num_srcs + 2);
 
       for (unsigned i = 0; i < instr->num_srcs; i++) {
          new_srcs[i].src_type = instr->src[i].src_type;
@@ -120,13 +123,19 @@ lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_progr
       /* Now we can go ahead and move the source over to being a
        * first-class texture source.
        */
+      instr->src[instr->num_srcs].src_type = nir_tex_src_texture_offset;
+      instr->num_srcs++;
+      nir_instr_rewrite_src(&instr->instr,
+                            &instr->src[instr->num_srcs - 1].src,
+                            nir_src_for_ssa(indirect));
+
       instr->src[instr->num_srcs].src_type = nir_tex_src_sampler_offset;
       instr->num_srcs++;
       nir_instr_rewrite_src(&instr->instr,
                             &instr->src[instr->num_srcs - 1].src,
                             nir_src_for_ssa(indirect));
 
-      instr->sampler_array_size = array_elements;
+      instr->texture_array_size = array_elements;
    }
 
    if (location > shader_program->NumUniformStorage - 1 ||
@@ -139,6 +148,8 @@ lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_progr
       shader_program->UniformStorage[location].opaque[stage].index;
 
    instr->sampler = NULL;
+
+   instr->texture_index = instr->sampler_index;
 }
 
 typedef struct {
@@ -180,8 +191,8 @@ void
 nir_lower_samplers(nir_shader *shader,
                    const struct gl_shader_program *shader_program)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         lower_impl(overload->impl, shader_program, shader->stage);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         lower_impl(function->impl, shader_program, shader->stage);
    }
 }
diff --git a/src/glsl/nir/nir_lower_samplers.cpp b/src/glsl/nir/nir_lower_samplers.cpp
new file mode 100644 (file)
index 0000000..438caac
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "../program.h"
+#include "program/hash_table.h"
+#include "ir_uniform.h"
+
+extern "C" {
+#include "main/compiler.h"
+#include "main/mtypes.h"
+#include "program/prog_parameter.h"
+#include "program/program.h"
+}
+
+static void
+add_indirect_to_tex(nir_tex_instr *instr, nir_src indirect)
+{
+   /* First, we have to resize the array of texture sources */
+   nir_tex_src *new_srcs = rzalloc_array(instr, nir_tex_src,
+                                         instr->num_srcs + 1);
+
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      new_srcs[i].src_type = instr->src[i].src_type;
+      nir_instr_move_src(&instr->instr, &new_srcs[i].src, &instr->src[i].src);
+   }
+
+   ralloc_free(instr->src);
+   instr->src = new_srcs;
+
+   /* Now we can go ahead and move the source over to being a
+    * first-class texture source.
+    */
+   instr->src[instr->num_srcs].src_type = nir_tex_src_sampler_offset;
+   instr->num_srcs++;
+   nir_instr_rewrite_src(&instr->instr, &instr->src[instr->num_srcs - 1].src,
+                         indirect);
+}
+
+static unsigned
+get_sampler_index(const struct gl_shader_program *shader_program,
+                  gl_shader_stage stage, const char *name)
+{
+   unsigned location;
+   if (!shader_program->UniformHash->get(location, name)) {
+      assert(!"failed to find sampler");
+      return 0;
+   }
+
+   if (!shader_program->UniformStorage[location].sampler[stage].active) {
+      assert(!"cannot return a sampler");
+      return 0;
+   }
+
+   return shader_program->UniformStorage[location].sampler[stage].index;
+}
+
+static void
+lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_program,
+              gl_shader_stage stage, void *mem_ctx)
+{
+   if (instr->sampler == NULL)
+      return;
+
+   /* Get the name and the offset */
+   instr->sampler_index = 0;
+   char *name = ralloc_strdup(mem_ctx, instr->sampler->var->name);
+
+   for (nir_deref *deref = &instr->sampler->deref;
+        deref->child; deref = deref->child) {
+      switch (deref->child->deref_type) {
+      case nir_deref_type_array: {
+         nir_deref_array *deref_array = nir_deref_as_array(deref->child);
+
+         assert(deref_array->deref_array_type != nir_deref_array_type_wildcard);
+
+         if (deref_array->deref.child) {
+            ralloc_asprintf_append(&name, "[%u]",
+               deref_array->deref_array_type == nir_deref_array_type_direct ?
+                  deref_array->base_offset : 0);
+         } else {
+            assert(deref->child->type->base_type == GLSL_TYPE_SAMPLER);
+            instr->sampler_index = deref_array->base_offset;
+         }
+
+         /* XXX: We're assuming here that the indirect is the last array
+          * thing we have.  This should be ok for now as we don't support
+          * arrays_of_arrays yet.
+          */
+         if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+            /* First, we have to resize the array of texture sources */
+            nir_tex_src *new_srcs = rzalloc_array(instr, nir_tex_src,
+                                                  instr->num_srcs + 1);
+
+            for (unsigned i = 0; i < instr->num_srcs; i++) {
+               new_srcs[i].src_type = instr->src[i].src_type;
+               nir_instr_move_src(&instr->instr, &new_srcs[i].src,
+                                  &instr->src[i].src);
+            }
+
+            ralloc_free(instr->src);
+            instr->src = new_srcs;
+
+            /* Now we can go ahead and move the source over to being a
+             * first-class texture source.
+             */
+            instr->src[instr->num_srcs].src_type = nir_tex_src_sampler_offset;
+            instr->num_srcs++;
+            nir_instr_move_src(&instr->instr,
+                               &instr->src[instr->num_srcs - 1].src,
+                               &deref_array->indirect);
+
+            instr->sampler_array_size = glsl_get_length(deref->type);
+         }
+         break;
+      }
+
+      case nir_deref_type_struct: {
+         nir_deref_struct *deref_struct = nir_deref_as_struct(deref->child);
+         const char *field = glsl_get_struct_elem_name(deref->type,
+                                                       deref_struct->index);
+         ralloc_asprintf_append(&name, ".%s", field);
+         break;
+      }
+
+      default:
+         unreachable("Invalid deref type");
+         break;
+      }
+   }
+
+   instr->sampler_index += get_sampler_index(shader_program, stage, name);
+
+   instr->sampler = NULL;
+}
+
+typedef struct {
+   void *mem_ctx;
+   const struct gl_shader_program *shader_program;
+   gl_shader_stage stage;
+} lower_state;
+
+static bool
+lower_block_cb(nir_block *block, void *_state)
+{
+   lower_state *state = (lower_state *) _state;
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type == nir_instr_type_tex) {
+         nir_tex_instr *tex_instr = nir_instr_as_tex(instr);
+         lower_sampler(tex_instr, state->shader_program, state->stage,
+                       state->mem_ctx);
+      }
+   }
+
+   return true;
+}
+
+static void
+lower_impl(nir_function_impl *impl, const struct gl_shader_program *shader_program,
+           gl_shader_stage stage)
+{
+   lower_state state;
+
+   state.mem_ctx = ralloc_parent(impl);
+   state.shader_program = shader_program;
+   state.stage = stage;
+
+   nir_foreach_block(impl, lower_block_cb, &state);
+}
+
+extern "C" void
+nir_lower_samplers(nir_shader *shader,
+                   const struct gl_shader_program *shader_program)
+{
+   nir_foreach_overload(shader, overload) {
+      if (overload->impl)
+         lower_impl(overload->impl, shader_program, shader->stage);
+   }
+}
+
+static bool
+lower_samplers_for_vk_block(nir_block *block, void *data)
+{
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_tex)
+         continue;
+
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      assert(tex->sampler);
+
+      tex->sampler_set = tex->sampler->var->data.descriptor_set;
+      tex->sampler_index = tex->sampler->var->data.binding;
+
+      if (tex->sampler->deref.child) {
+         assert(tex->sampler->deref.child->deref_type == nir_deref_type_array);
+         nir_deref_array *arr = nir_deref_as_array(tex->sampler->deref.child);
+
+         /* Only one-level arrays are allowed in vulkan */
+         assert(arr->deref.child == NULL);
+
+         tex->sampler_index += arr->base_offset;
+         if (arr->deref_array_type == nir_deref_array_type_indirect) {
+            add_indirect_to_tex(tex, arr->indirect);
+            nir_instr_rewrite_src(instr, &arr->indirect, NIR_SRC_INIT);
+
+            tex->sampler_array_size = glsl_get_length(tex->sampler->deref.type);
+         }
+      }
+
+      tex->sampler = NULL;
+   }
+
+   return true;
+}
+
+extern "C" void
+nir_lower_samplers_for_vk(nir_shader *shader)
+{
+   nir_foreach_overload(shader, overload) {
+      if (overload->impl) {
+         nir_foreach_block(overload->impl, lower_samplers_for_vk_block, NULL);
+      }
+   }
+}
index 21904f8..69d0554 100644 (file)
  */
 
 #include "nir.h"
-#include "main/mtypes.h"
+#include "nir_builder.h"
 
-static bool
-convert_instr(nir_intrinsic_instr *instr)
-{
-   if (instr->intrinsic != nir_intrinsic_load_var)
-      return false;
-
-   nir_variable *var = instr->variables[0]->var;
-   if (var->data.mode != nir_var_system_value)
-      return false;
-
-   void *mem_ctx = ralloc_parent(instr);
-
-   nir_intrinsic_op op = nir_intrinsic_from_system_value(var->data.location);
-   nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(mem_ctx, op);
-
-   if (instr->dest.is_ssa) {
-      nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
-                        instr->dest.ssa.num_components, NULL);
-      nir_ssa_def_rewrite_uses(&instr->dest.ssa,
-                               nir_src_for_ssa(&new_instr->dest.ssa));
-   } else {
-      nir_dest_copy(&new_instr->dest, &instr->dest, mem_ctx);
-   }
-
-   nir_instr_insert_before(&instr->instr, &new_instr->instr);
-   nir_instr_remove(&instr->instr);
-
-   return true;
-}
+struct lower_system_values_state {
+   nir_builder builder;
+   bool progress;
+};
 
 static bool
-convert_block(nir_block *block, void *state)
+convert_block(nir_block *block, void *void_state)
 {
-   bool *progress = state;
+   struct lower_system_values_state *state = void_state;
+
+   nir_builder *b = &state->builder;
 
    nir_foreach_instr_safe(block, instr) {
-      if (instr->type == nir_instr_type_intrinsic)
-         *progress = convert_instr(nir_instr_as_intrinsic(instr)) || *progress;
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *load_var = nir_instr_as_intrinsic(instr);
+
+      if (load_var->intrinsic != nir_intrinsic_load_var)
+         continue;
+
+      nir_variable *var = load_var->variables[0]->var;
+      if (var->data.mode != nir_var_system_value)
+         continue;
+
+      b->cursor = nir_after_instr(&load_var->instr);
+
+      nir_ssa_def *sysval;
+      switch (var->data.location) {
+      case SYSTEM_VALUE_GLOBAL_INVOCATION_ID: {
+         /* From the GLSL man page for gl_GlobalInvocationID:
+          *
+          *    "The value of gl_GlobalInvocationID is equal to
+          *    gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
+          */
+
+         nir_const_value local_size;
+         local_size.u[0] = b->shader->info.cs.local_size[0];
+         local_size.u[1] = b->shader->info.cs.local_size[1];
+         local_size.u[2] = b->shader->info.cs.local_size[2];
+
+         nir_ssa_def *group_id =
+            nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0);
+         nir_ssa_def *local_id =
+            nir_load_system_value(b, nir_intrinsic_load_local_invocation_id, 0);
+
+         sysval = nir_iadd(b, nir_imul(b, group_id,
+                                          nir_build_imm(b, 3, local_size)),
+                              local_id);
+         break;
+      }
+
+      case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX: {
+         /* From the GLSL man page for gl_LocalInvocationIndex:
+          *
+          *    "The value of gl_LocalInvocationIndex is equal to
+          *    gl_LocalInvocationID.z * gl_WorkGroupSize.x *
+          *    gl_WorkGroupSize.y + gl_LocalInvocationID.y *
+          *    gl_WorkGroupSize.x + gl_LocalInvocationID.x"
+          */
+         nir_ssa_def *local_id =
+            nir_load_system_value(b, nir_intrinsic_load_invocation_id, 0);
+
+         unsigned stride_y = b->shader->info.cs.local_size[0];
+         unsigned stride_z = b->shader->info.cs.local_size[0] *
+                             b->shader->info.cs.local_size[1];
+
+         sysval = nir_iadd(b, nir_imul(b, nir_channel(b, local_id, 2),
+                                          nir_imm_int(b, stride_z)),
+                              nir_iadd(b, nir_imul(b, nir_channel(b, local_id, 1),
+                                                      nir_imm_int(b, stride_y)),
+                                          nir_channel(b, local_id, 0)));
+         break;
+      }
+
+      case SYSTEM_VALUE_VERTEX_ID:
+         if (b->shader->options->vertex_id_zero_based) {
+            sysval = nir_iadd(b,
+               nir_load_system_value(b, nir_intrinsic_load_vertex_id_zero_base, 0),
+               nir_load_system_value(b, nir_intrinsic_load_base_vertex, 0));
+         } else {
+            sysval = nir_load_system_value(b, nir_intrinsic_load_vertex_id, 0);
+         }
+         break;
+
+      default: {
+         nir_intrinsic_op sysval_op =
+            nir_intrinsic_from_system_value(var->data.location);
+         sysval = nir_load_system_value(b, sysval_op, 0);
+         break;
+      } /* default */
+      }
+
+      nir_ssa_def_rewrite_uses(&load_var->dest.ssa, nir_src_for_ssa(sysval));
+      nir_instr_remove(&load_var->instr);
+
+      state->progress = true;
    }
 
    return true;
@@ -74,12 +133,15 @@ convert_block(nir_block *block, void *state)
 static bool
 convert_impl(nir_function_impl *impl)
 {
-   bool progress = false;
+   struct lower_system_values_state state;
+
+   state.progress = false;
+   nir_builder_init(&state.builder, impl);
 
-   nir_foreach_block(impl, convert_block, &progress);
+   nir_foreach_block(impl, convert_block, &state);
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
-   return progress;
+   return state.progress;
 }
 
 bool
@@ -87,9 +149,9 @@ nir_lower_system_values(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = convert_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = convert_impl(function->impl) || progress;
    }
 
    exec_list_make_empty(&shader->system_values);
index 93ebf8e..ae24fb2 100644 (file)
@@ -346,9 +346,9 @@ nir_lower_tex(nir_shader *shader, const nir_lower_tex_options *options)
    state.options = options;
    state.progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_tex_impl(overload->impl, &state);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_tex_impl(function->impl, &state);
    }
 
    return state.progress;
index 94c7e36..6c4e1f0 100644 (file)
@@ -189,8 +189,8 @@ nir_lower_to_source_mods_impl(nir_function_impl *impl)
 void
 nir_lower_to_source_mods(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_to_source_mods_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_to_source_mods_impl(function->impl);
    }
 }
index 7df12e0..1294cb8 100644 (file)
@@ -204,9 +204,9 @@ nir_lower_two_sided_color(nir_shader *shader)
    if (setup_inputs(&state) != 0)
       return;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_two_sided_color_impl(overload->impl, &state);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_two_sided_color_impl(function->impl, &state);
    }
 
 }
index 98c107a..350e99c 100644 (file)
@@ -128,6 +128,7 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_store_var);
       store->num_components = num_components;
+      store->const_index[0] = (1 << num_components) - 1;
       store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &dest_head->deref));
 
       store->src[0].is_ssa = true;
@@ -182,8 +183,8 @@ lower_var_copies_impl(nir_function_impl *impl)
 void
 nir_lower_var_copies(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         lower_var_copies_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         lower_var_copies_impl(function->impl);
    }
 }
index e670dbd..e1f368d 100644 (file)
@@ -26,6 +26,8 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
+#include "nir_phi_builder.h"
 #include "nir_vla.h"
 
 
@@ -46,8 +48,7 @@ struct deref_node {
    struct set *stores;
    struct set *copies;
 
-   nir_ssa_def **def_stack;
-   nir_ssa_def **def_stack_tail;
+   struct nir_phi_builder_value *pb_value;
 
    struct deref_node *wildcard;
    struct deref_node *indirect;
@@ -86,8 +87,7 @@ struct lower_variables_state {
     */
    bool add_to_direct_deref_nodes;
 
-   /* A hash table mapping phi nodes to deref_state data */
-   struct hash_table *phi_table;
+   struct nir_phi_builder *phi_builder;
 };
 
 static struct deref_node *
@@ -472,114 +472,6 @@ lower_copies_to_load_store(struct deref_node *node,
    return true;
 }
 
-/** Pushes an SSA def onto the def stack for the given node
- *
- * Each node is potentially associated with a stack of SSA definitions.
- * This stack is used for determining what SSA definition reaches a given
- * point in the program for variable renaming.  The stack is always kept in
- * dominance-order with at most one SSA def per block.  If the SSA
- * definition on the top of the stack is in the same block as the one being
- * pushed, the top element is replaced.
- */
-static void
-def_stack_push(struct deref_node *node, nir_ssa_def *def,
-               struct lower_variables_state *state)
-{
-   if (node->def_stack == NULL) {
-      node->def_stack = ralloc_array(state->dead_ctx, nir_ssa_def *,
-                                     state->impl->num_blocks);
-      node->def_stack_tail = node->def_stack - 1;
-   }
-
-   if (node->def_stack_tail >= node->def_stack) {
-      nir_ssa_def *top_def = *node->def_stack_tail;
-
-      if (def->parent_instr->block == top_def->parent_instr->block) {
-         /* They're in the same block, just replace the top */
-         *node->def_stack_tail = def;
-         return;
-      }
-   }
-
-   *(++node->def_stack_tail) = def;
-}
-
-/* Pop the top of the def stack if it's in the given block */
-static void
-def_stack_pop_if_in_block(struct deref_node *node, nir_block *block)
-{
-   /* If we're popping, then we have presumably pushed at some time in the
-    * past so this should exist.
-    */
-   assert(node->def_stack != NULL);
-
-   /* The stack is already empty.  Do nothing. */
-   if (node->def_stack_tail < node->def_stack)
-      return;
-
-   nir_ssa_def *def = *node->def_stack_tail;
-   if (def->parent_instr->block == block)
-      node->def_stack_tail--;
-}
-
-/** Retrieves the SSA definition on the top of the stack for the given
- * node, if one exists.  If the stack is empty, then we return the constant
- * initializer (if it exists) or an SSA undef.
- */
-static nir_ssa_def *
-get_ssa_def_for_block(struct deref_node *node, nir_block *block,
-                      struct lower_variables_state *state)
-{
-   /* If we have something on the stack, go ahead and return it.  We're
-    * assuming that the top of the stack dominates the given block.
-    */
-   if (node->def_stack && node->def_stack_tail >= node->def_stack)
-      return *node->def_stack_tail;
-
-   /* If we got here then we don't have a definition that dominates the
-    * given block.  This means that we need to add an undef and use that.
-    */
-   nir_ssa_undef_instr *undef =
-      nir_ssa_undef_instr_create(state->shader,
-                                 glsl_get_vector_elements(node->type));
-   nir_instr_insert_before_cf_list(&state->impl->body, &undef->instr);
-   def_stack_push(node, &undef->def, state);
-   return &undef->def;
-}
-
-/* Given a block and one of its predecessors, this function fills in the
- * souces of the phi nodes to take SSA defs from the given predecessor.
- * This function must be called exactly once per block/predecessor pair.
- */
-static void
-add_phi_sources(nir_block *block, nir_block *pred,
-                struct lower_variables_state *state)
-{
-   nir_foreach_instr(block, instr) {
-      if (instr->type != nir_instr_type_phi)
-         break;
-
-      nir_phi_instr *phi = nir_instr_as_phi(instr);
-
-      struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
-      if (!entry)
-         continue;
-
-      struct deref_node *node = entry->data;
-
-      nir_phi_src *src = ralloc(phi, nir_phi_src);
-      src->pred = pred;
-      src->src.parent_instr = &phi->instr;
-      src->src.is_ssa = true;
-      src->src.ssa = get_ssa_def_for_block(node, pred, state);
-
-      list_addtail(&src->src.use_link, &src->src.ssa->uses);
-
-      exec_list_push_tail(&phi->srcs, &src->node);
-   }
-}
-
 /* Performs variable renaming by doing a DFS of the dominance tree
  *
  * This algorithm is very similar to the one outlined in "Efficiently
@@ -590,246 +482,130 @@ add_phi_sources(nir_block *block, nir_block *pred,
 static bool
 rename_variables_block(nir_block *block, struct lower_variables_state *state)
 {
-   nir_foreach_instr_safe(block, instr) {
-      if (instr->type == nir_instr_type_phi) {
-         nir_phi_instr *phi = nir_instr_as_phi(instr);
-
-         struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
-
-         /* This can happen if we already have phi nodes in the program
-          * that were not created in this pass.
-          */
-         if (!entry)
-            continue;
-
-         struct deref_node *node = entry->data;
-
-         def_stack_push(node, &phi->dest.ssa, state);
-      } else if (instr->type == nir_instr_type_intrinsic) {
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-         switch (intrin->intrinsic) {
-         case nir_intrinsic_load_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
-
-            if (node == NULL) {
-               /* If we hit this path then we are referencing an invalid
-                * value.  Most likely, we unrolled something and are
-                * reading past the end of some array.  In any case, this
-                * should result in an undefined value.
-                */
-               nir_ssa_undef_instr *undef =
-                  nir_ssa_undef_instr_create(state->shader,
-                                             intrin->num_components);
-
-               nir_instr_insert_before(&intrin->instr, &undef->instr);
-               nir_instr_remove(&intrin->instr);
-
-               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                        nir_src_for_ssa(&undef->def));
-               continue;
-            }
+   nir_builder b;
+   nir_builder_init(&b, state->impl);
 
-            if (!node->lower_to_ssa)
-               continue;
-
-            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
-                                                      nir_op_imov);
-            mov->src[0].src.is_ssa = true;
-            mov->src[0].src.ssa = get_ssa_def_for_block(node, block, state);
-            for (unsigned i = intrin->num_components; i < 4; i++)
-               mov->src[0].swizzle[i] = 0;
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-            assert(intrin->dest.is_ssa);
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
-            mov->dest.write_mask = (1 << intrin->num_components) - 1;
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_var: {
+         struct deref_node *node =
+            get_deref_node(intrin->variables[0], state);
+
+         if (node == NULL) {
+            /* If we hit this path then we are referencing an invalid
+             * value.  Most likely, we unrolled something and are
+             * reading past the end of some array.  In any case, this
+             * should result in an undefined value.
+             */
+            nir_ssa_undef_instr *undef =
+               nir_ssa_undef_instr_create(state->shader,
+                                          intrin->num_components);
 
-            nir_instr_insert_before(&intrin->instr, &mov->instr);
+            nir_instr_insert_before(&intrin->instr, &undef->instr);
             nir_instr_remove(&intrin->instr);
 
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                     nir_src_for_ssa(&mov->dest.dest.ssa));
-            break;
+                                     nir_src_for_ssa(&undef->def));
+            continue;
          }
 
-         case nir_intrinsic_store_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
-
-            if (node == NULL) {
-               /* Probably an out-of-bounds array store.  That should be a
-                * no-op. */
-               nir_instr_remove(&intrin->instr);
-               continue;
-            }
-
-            if (!node->lower_to_ssa)
-               continue;
-
-            assert(intrin->num_components ==
-                   glsl_get_vector_elements(node->type));
-
-            assert(intrin->src[0].is_ssa);
-
-            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
-                                                      nir_op_imov);
-            mov->src[0].src.is_ssa = true;
-            mov->src[0].src.ssa = intrin->src[0].ssa;
-            for (unsigned i = intrin->num_components; i < 4; i++)
-               mov->src[0].swizzle[i] = 0;
+         if (!node->lower_to_ssa)
+            continue;
 
-            mov->dest.write_mask = (1 << intrin->num_components) - 1;
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+         nir_alu_instr *mov = nir_alu_instr_create(state->shader,
+                                                   nir_op_imov);
+         mov->src[0].src = nir_src_for_ssa(
+            nir_phi_builder_value_get_block_def(node->pb_value, block));
+         for (unsigned i = intrin->num_components; i < 4; i++)
+            mov->src[0].swizzle[i] = 0;
 
-            nir_instr_insert_before(&intrin->instr, &mov->instr);
+         assert(intrin->dest.is_ssa);
 
-            def_stack_push(node, &mov->dest.dest.ssa, state);
+         mov->dest.write_mask = (1 << intrin->num_components) - 1;
+         nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
+                           intrin->num_components, NULL);
 
-            /* We'll wait to remove the instruction until the next pass
-             * where we pop the node we just pushed back off the stack.
-             */
-            break;
-         }
+         nir_instr_insert_before(&intrin->instr, &mov->instr);
+         nir_instr_remove(&intrin->instr);
 
-         default:
-            break;
-         }
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  nir_src_for_ssa(&mov->dest.dest.ssa));
+         break;
       }
-   }
 
-   if (block->successors[0])
-      add_phi_sources(block->successors[0], block, state);
-   if (block->successors[1])
-      add_phi_sources(block->successors[1], block, state);
+      case nir_intrinsic_store_var: {
+         struct deref_node *node =
+            get_deref_node(intrin->variables[0], state);
 
-   for (unsigned i = 0; i < block->num_dom_children; ++i)
-      rename_variables_block(block->dom_children[i], state);
-
-   /* Now we iterate over the instructions and pop off any SSA defs that we
-    * pushed in the first loop.
-    */
-   nir_foreach_instr_safe(block, instr) {
-      if (instr->type == nir_instr_type_phi) {
-         nir_phi_instr *phi = nir_instr_as_phi(instr);
-
-         struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
-
-         /* This can happen if we already have phi nodes in the program
-          * that were not created in this pass.
-          */
-         if (!entry)
-            continue;
-
-         struct deref_node *node = entry->data;
-
-         def_stack_pop_if_in_block(node, block);
-      } else if (instr->type == nir_instr_type_intrinsic) {
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-         if (intrin->intrinsic != nir_intrinsic_store_var)
-            continue;
-
-         struct deref_node *node = get_deref_node(intrin->variables[0], state);
-         if (!node)
+         if (node == NULL) {
+            /* Probably an out-of-bounds array store.  That should be a
+             * no-op. */
+            nir_instr_remove(&intrin->instr);
             continue;
+         }
 
          if (!node->lower_to_ssa)
             continue;
 
-         def_stack_pop_if_in_block(node, block);
-         nir_instr_remove(&intrin->instr);
-      }
-   }
+         assert(intrin->num_components ==
+                glsl_get_vector_elements(node->type));
 
-   return true;
-}
+         assert(intrin->src[0].is_ssa);
 
-/* Inserts phi nodes for all variables marked lower_to_ssa
- *
- * This is the same algorithm as presented in "Efficiently Computing Static
- * Single Assignment Form and the Control Dependence Graph" by Cytron et.
- * al.
- */
-static void
-insert_phi_nodes(struct lower_variables_state *state)
-{
-   NIR_VLA_ZERO(unsigned, work, state->impl->num_blocks);
-   NIR_VLA_ZERO(unsigned, has_already, state->impl->num_blocks);
-
-   /*
-    * Since the work flags already prevent us from inserting a node that has
-    * ever been inserted into W, we don't need to use a set to represent W.
-    * Also, since no block can ever be inserted into W more than once, we know
-    * that the maximum size of W is the number of basic blocks in the
-    * function. So all we need to handle W is an array and a pointer to the
-    * next element to be inserted and the next element to be removed.
-    */
-   NIR_VLA(nir_block *, W, state->impl->num_blocks);
-
-   unsigned w_start, w_end;
-   unsigned iter_count = 0;
-
-   foreach_list_typed(struct deref_node, node, direct_derefs_link,
-                      &state->direct_deref_nodes) {
-      if (node->stores == NULL)
-         continue;
+         nir_ssa_def *new_def;
+         b.cursor = nir_before_instr(&intrin->instr);
 
-      if (!node->lower_to_ssa)
-         continue;
-
-      w_start = w_end = 0;
-      iter_count++;
-
-      struct set_entry *store_entry;
-      set_foreach(node->stores, store_entry) {
-         nir_intrinsic_instr *store = (nir_intrinsic_instr *)store_entry->key;
-         if (work[store->instr.block->index] < iter_count)
-            W[w_end++] = store->instr.block;
-         work[store->instr.block->index] = iter_count;
-      }
-
-      while (w_start != w_end) {
-         nir_block *cur = W[w_start++];
-         struct set_entry *dom_entry;
-         set_foreach(cur->dom_frontier, dom_entry) {
-            nir_block *next = (nir_block *) dom_entry->key;
-
-            /*
-             * If there's more than one return statement, then the end block
-             * can be a join point for some definitions. However, there are
-             * no instructions in the end block, so nothing would use those
-             * phi nodes. Of course, we couldn't place those phi nodes
-             * anyways due to the restriction of having no instructions in the
-             * end block...
+         if (intrin->const_index[0] == (1 << intrin->num_components) - 1) {
+            /* Whole variable store - just copy the source.  Note that
+             * intrin->num_components and intrin->src[0].ssa->num_components
+             * may differ.
              */
-            if (next == state->impl->end_block)
-               continue;
-
-            if (has_already[next->index] < iter_count) {
-               nir_phi_instr *phi = nir_phi_instr_create(state->shader);
-               nir_ssa_dest_init(&phi->instr, &phi->dest,
-                                 glsl_get_vector_elements(node->type), NULL);
-               nir_instr_insert_before_block(next, &phi->instr);
+            unsigned swiz[4];
+            for (unsigned i = 0; i < 4; i++)
+               swiz[i] = i < intrin->num_components ? i : 0;
 
-               _mesa_hash_table_insert(state->phi_table, phi, node);
-
-               has_already[next->index] = iter_count;
-               if (work[next->index] < iter_count) {
-                  work[next->index] = iter_count;
-                  W[w_end++] = next;
+            new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
+                                  intrin->num_components, false);
+         } else {
+            nir_ssa_def *old_def =
+               nir_phi_builder_value_get_block_def(node->pb_value, block);
+            /* For writemasked store_var intrinsics, we combine the newly
+             * written values with the existing contents of unwritten
+             * channels, creating a new SSA value for the whole vector.
+             */
+            nir_ssa_def *srcs[4];
+            for (unsigned i = 0; i < intrin->num_components; i++) {
+               if (intrin->const_index[0] & (1 << i)) {
+                  srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
+               } else {
+                  srcs[i] = nir_channel(&b, old_def, i);
                }
             }
+            new_def = nir_vec(&b, srcs, intrin->num_components);
          }
+
+         assert(new_def->num_components == intrin->num_components);
+
+         nir_phi_builder_value_set_block_def(node->pb_value, block, new_def);
+         nir_instr_remove(&intrin->instr);
+         break;
+      }
+
+      default:
+         break;
       }
    }
-}
 
+   for (unsigned i = 0; i < block->num_dom_children; ++i)
+      rename_variables_block(block->dom_children[i], state);
+
+   return true;
+}
 
 /** Implements a pass to lower variable uses to SSA values
  *
@@ -863,7 +639,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
 {
    struct lower_variables_state state;
 
-   state.shader = impl->overload->function->shader;
+   state.shader = impl->function->shader;
    state.dead_ctx = ralloc_context(state.shader);
    state.impl = impl;
 
@@ -871,9 +647,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
                                                    _mesa_hash_pointer,
                                                    _mesa_key_pointer_equal);
    exec_list_make_empty(&state.direct_deref_nodes);
-   state.phi_table = _mesa_hash_table_create(state.dead_ctx,
-                                             _mesa_hash_pointer,
-                                             _mesa_key_pointer_equal);
 
    /* Build the initial deref structures and direct_deref_nodes table */
    state.add_to_direct_deref_nodes = true;
@@ -903,15 +676,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
       node->lower_to_ssa = true;
       progress = true;
 
-      if (deref->var->constant_initializer) {
-         nir_load_const_instr *load =
-            nir_deref_get_const_initializer_load(state.shader, deref);
-         nir_ssa_def_init(&load->instr, &load->def,
-                          glsl_get_vector_elements(node->type), NULL);
-         nir_instr_insert_before_cf_list(&impl->body, &load->instr);
-         def_stack_push(node, &load->def, &state);
-      }
-
       foreach_deref_node_match(deref, lower_copies_to_load_store, &state);
    }
 
@@ -928,9 +692,47 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
     */
    nir_foreach_block(impl, register_variable_uses_block, &state);
 
-   insert_phi_nodes(&state);
+   state.phi_builder = nir_phi_builder_create(state.impl);
+
+   NIR_VLA(BITSET_WORD, store_blocks, BITSET_WORDS(state.impl->num_blocks));
+   foreach_list_typed(struct deref_node, node, direct_derefs_link,
+                      &state.direct_deref_nodes) {
+      if (!node->lower_to_ssa)
+         continue;
+
+      memset(store_blocks, 0,
+             BITSET_WORDS(state.impl->num_blocks) * sizeof(*store_blocks));
+
+      if (node->stores) {
+         struct set_entry *store_entry;
+         set_foreach(node->stores, store_entry) {
+            nir_intrinsic_instr *store =
+               (nir_intrinsic_instr *)store_entry->key;
+            BITSET_SET(store_blocks, store->instr.block->index);
+         }
+      }
+
+      if (node->deref->var->constant_initializer)
+         BITSET_SET(store_blocks, 0);
+
+      node->pb_value =
+         nir_phi_builder_add_value(state.phi_builder,
+                                   glsl_get_vector_elements(node->type),
+                                   store_blocks);
+
+      if (node->deref->var->constant_initializer) {
+         nir_load_const_instr *load =
+            nir_deref_get_const_initializer_load(state.shader, node->deref);
+         nir_instr_insert_before_cf_list(&impl->body, &load->instr);
+         nir_phi_builder_value_set_block_def(node->pb_value,
+                                             nir_start_block(impl), &load->def);
+      }
+   }
+
    rename_variables_block(nir_start_block(impl), &state);
 
+   nir_phi_builder_finish(state.phi_builder);
+
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
 
@@ -942,8 +744,8 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
 void
 nir_lower_vars_to_ssa(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_lower_vars_to_ssa_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_lower_vars_to_ssa_impl(function->impl);
    }
 }
index 736a66c..06d6279 100644 (file)
@@ -217,7 +217,7 @@ lower_vec_to_movs_block(nir_block *block, void *void_state)
 {
    struct vec_to_movs_state *state = void_state;
    nir_function_impl *impl = state->impl;
-   nir_shader *shader = impl->overload->function->shader;
+   nir_shader *shader = impl->function->shader;
 
    nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
@@ -301,9 +301,9 @@ nir_lower_vec_to_movs(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = nir_lower_vec_to_movs_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_lower_vec_to_movs_impl(function->impl) || progress;
    }
 
    return progress;
index d5324b3..61aae73 100644 (file)
@@ -63,9 +63,9 @@ nir_metadata_preserve(nir_function_impl *impl, nir_metadata preserved)
 void
 nir_metadata_set_validation_flag(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         overload->impl->valid_metadata |= nir_metadata_not_properly_reset;
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         function->impl->valid_metadata |= nir_metadata_not_properly_reset;
       }
    }
 }
@@ -80,9 +80,9 @@ nir_metadata_set_validation_flag(nir_shader *shader)
 void
 nir_metadata_check_validation_flag(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         assert(!(overload->impl->valid_metadata &
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         assert(!(function->impl->valid_metadata &
                   nir_metadata_not_properly_reset));
       }
    }
index 4c9032d..b5186e6 100644 (file)
@@ -190,8 +190,8 @@ nir_move_vec_src_uses_to_dest_impl(nir_shader *shader, nir_function_impl *impl)
 void
 nir_move_vec_src_uses_to_dest(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_move_vec_src_uses_to_dest_impl(shader, overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_move_vec_src_uses_to_dest_impl(shader, function->impl);
    }
 }
index 7385576..9c15eb8 100644 (file)
@@ -111,9 +111,9 @@ nir_normalize_cubemap_coords(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = normalize_cubemap_coords_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = normalize_cubemap_coords_impl(function->impl) || progress;
    }
 
    return progress;
index 37d3dfc..1b17620 100644 (file)
@@ -167,13 +167,6 @@ unop_convert("i2b", tint, tbool, "src0 != 0")
 unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
 unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion.
 
-unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
-unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
-unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}",
-            "{src} ? 1.0f : 0.0f")
-unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}",
-            "{src} ? 1.0f : 0.0f")
-
 # Unary floating-point rounding operations.
 
 
@@ -183,6 +176,7 @@ unop("ffloor", tfloat, "floorf(src0)")
 unop("ffract", tfloat, "src0 - floorf(src0)")
 unop("fround_even", tfloat, "_mesa_roundevenf(src0)")
 
+unop("fquantize2f16", tfloat, "_mesa_half_to_float(_mesa_float_to_half(src0))")
 
 # Trigonometric operations.
 
@@ -368,16 +362,30 @@ binop("udiv", tuint, "", "src0 / src1")
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tuint, commutative, "src0 + src1 < src0")
+binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tuint, "", "src1 < src0")
+binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 
-binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 
+# For signed integers, there are several different possible definitions of
+# "modulus" or "remainder".  We follow the conventions used by LLVM and
+# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
+# operation while the imod opcode implements the more mathematical
+# "modulus" operation.  For details on the difference, see
+#
+# http://mathforum.org/library/drmath/view/52343.html
+
+binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
+binop("imod", tint, "",
+      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
+      "                 src0 % src1 : src0 % src1 + src1)")
+binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
+binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
+
 #
 # Comparisons
 #
@@ -518,12 +526,15 @@ binop("fpow", tfloat, "", "powf(src0, src1)")
 binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 
+# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
+# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
+# if either of its arguments are 32.
 binop_convert("bfm", tuint, tint, "", """
-int offset = src0, bits = src1;
-if (offset < 0 || bits < 0 || offset + bits > 32)
-   dst = 0; /* undefined per the spec */
+int bits = src0, offset = src1;
+if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
+   dst = 0; /* undefined */
 else
-   dst = ((1 << bits)- 1) << offset;
+   dst = ((1u << bits) - 1) << offset;
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
@@ -562,6 +573,7 @@ triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 
+# SM5 bfi assembly
 triop("bfi", tuint, """
 unsigned mask = src0, insert = src1, base = src2;
 if (mask == 0) {
@@ -576,22 +588,53 @@ if (mask == 0) {
 }
 """)
 
+# SM5 ubfe/ibfe assembly
+opcode("ubfe", 0, tuint,
+       [0, 0, 0], [tuint, tint, tint], "", """
+unsigned base = src0;
+int offset = src1, bits = src2;
+if (bits == 0) {
+   dst = 0;
+} else if (bits < 0 || offset < 0) {
+   dst = 0; /* undefined */
+} else if (offset + bits < 32) {
+   dst = (base << (32 - bits - offset)) >> (32 - bits);
+} else {
+   dst = base >> offset;
+}
+""")
+opcode("ibfe", 0, tint,
+       [0, 0, 0], [tint, tint, tint], "", """
+int base = src0;
+int offset = src1, bits = src2;
+if (bits == 0) {
+   dst = 0;
+} else if (bits < 0 || offset < 0) {
+   dst = 0; /* undefined */
+} else if (offset + bits < 32) {
+   dst = (base << (32 - bits - offset)) >> (32 - bits);
+} else {
+   dst = base >> offset;
+}
+""")
+
+# GLSL bitfieldExtract()
 opcode("ubitfield_extract", 0, tuint,
-       [0, 1, 1], [tuint, tint, tint], "", """
+       [0, 0, 0], [tuint, tint, tint], "", """
 unsigned base = src0;
-int offset = src1.x, bits = src2.x;
+int offset = src1, bits = src2;
 if (bits == 0) {
    dst = 0;
 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
    dst = 0; /* undefined per the spec */
 } else {
-   dst = (base >> offset) & ((1 << bits) - 1);
+   dst = (base >> offset) & ((1ull << bits) - 1);
 }
 """)
 opcode("ibitfield_extract", 0, tint,
-       [0, 1, 1], [tint, tint, tint], "", """
+       [0, 0, 0], [tint, tint, tint], "", """
 int base = src0;
-int offset = src1.x, bits = src2.x;
+int offset = src1, bits = src2;
 if (bits == 0) {
    dst = 0;
 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
@@ -616,16 +659,16 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
           [tuint, tuint, tuint, tuint],
           "", const_expr)
 
-opcode("bitfield_insert", 0, tuint, [0, 0, 1, 1],
+opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
        [tuint, tuint, tint, tint], "", """
 unsigned base = src0, insert = src1;
-int offset = src2.x, bits = src3.x;
+int offset = src2, bits = src3;
 if (bits == 0) {
    dst = 0;
 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
    dst = 0;
 } else {
-   unsigned mask = ((1 << bits) - 1) << offset;
+   unsigned mask = ((1ull << bits) - 1) << offset;
    dst = (base & ~mask) | ((insert << bits) & mask);
 }
 """)
index cb715c0..a46cbf7 100644 (file)
@@ -1,4 +1,5 @@
 #! /usr/bin/env python
+# -*- encoding: utf-8 -*-
 #
 # Copyright (C) 2014 Intel Corporation
 #
@@ -62,6 +63,10 @@ optimizations = [
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
    (('fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
+   (('iadd', ('ineg', a), ('iadd', a, b)), b),
+   (('iadd', a, ('iadd', ('ineg', a), b)), b),
+   (('fadd', ('fneg', a), ('fadd', a, b)), b),
+   (('fadd', a, ('fadd', ('fneg', a), b)), b),
    (('fmul', a, 0.0), 0.0),
    (('imul', a, 0), 0),
    (('umul_unorm_4x8', a, 0), 0),
@@ -70,6 +75,7 @@ optimizations = [
    (('imul', a, 1), a),
    (('fmul', a, -1.0), ('fneg', a)),
    (('imul', a, -1), ('ineg', a)),
+   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
    (('ffma', 0.0, a, b), b),
    (('ffma', a, 0.0, b), b),
    (('ffma', a, b, 0.0), ('fmul', a, b)),
@@ -179,6 +185,7 @@ optimizations = [
    (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
    # Division and reciprocal
    (('fdiv', 1.0, a), ('frcp', a)),
+   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
    (('frcp', ('frcp', a)), a),
    (('frcp', ('fsqrt', a)), ('frsq', a)),
    (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
@@ -217,6 +224,27 @@ optimizations = [
    (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
+
+   # Misc. lowering
+   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
+   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
+   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
+
+   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
+    ('bcsel', ('ilt', 31, 'bits'), 'insert',
+              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
+    'options->lower_bitfield_insert'),
+
+   (('ibitfield_extract', 'value', 'offset', 'bits'),
+    ('bcsel', ('ilt', 31, 'bits'), 'value',
+              ('ibfe', 'value', 'offset', 'bits')),
+    'options->lower_bitfield_extract'),
+
+   (('ubitfield_extract', 'value', 'offset', 'bits'),
+    ('bcsel', ('ult', 31, 'bits'), 'value',
+              ('ubfe', 'value', 'offset', 'bits')),
+    'options->lower_bitfield_extract'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is
@@ -240,6 +268,68 @@ for op in ['flt', 'fge', 'feq', 'fne',
        ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
    ]
 
+def ldexp_to_arith(x, exp):
+   """
+   Translates
+      ldexp x exp
+   into
+
+      extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
+      resulting_biased_exp = extracted_biased_exp + exp;
+
+      if (resulting_biased_exp < 1) {
+         return copysign(0.0, x);
+      }
+
+      return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
+                         lshift(i2u(resulting_biased_exp), exp_shift));
+
+   which we can't actually implement as such, since NIR doesn't have
+   vectorized if-statements. We actually implement it without branches
+   using conditional-select:
+
+      extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
+      resulting_biased_exp = extracted_biased_exp + exp;
+
+      is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
+      x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
+      resulting_biased_exp = csel(is_not_zero_or_underflow,
+                                  resulting_biased_exp, 0);
+
+      return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
+                         lshift(i2u(resulting_biased_exp), exp_shift));
+   """
+
+   sign_mask = 0x80000000
+   exp_shift = 23
+   exp_width = 8
+
+   # Extract the biased exponent from <x>.
+   extracted_biased_exp = ('ushr', ('fabs', x), exp_shift)
+   resulting_biased_exp = ('iadd', extracted_biased_exp, exp)
+
+   # Test if result is ±0.0, subnormal, or underflow by checking if the
+   # resulting biased exponent would be less than 0x1. If so, the result is
+   # 0.0 with the sign of x. (Actually, invert the conditions so that
+   # immediate values are the second arguments, which is better for i965)
+   zero_sign_x = ('iand', x, sign_mask)
+
+   is_not_zero_or_underflow = ('ige', resulting_biased_exp, 0x1)
+
+   # We could test for overflows by checking if the resulting biased exponent
+   # would be greater than 0xFE. Turns out we don't need to because the GLSL
+   # spec says:
+   #
+   #    "If this product is too large to be represented in the
+   #     floating-point type, the result is undefined."
+
+   return ('bitfield_insert',
+           ('bcsel', is_not_zero_or_underflow, x, zero_sign_x),
+           ('bcsel', is_not_zero_or_underflow, resulting_biased_exp, 0),
+           exp_shift, exp_width)
+
+optimizations += [(('ldexp', 'x', 'exp'), ldexp_to_arith('x', 'exp'))]
+
 # This section contains "late" optimizations that should be run after the
 # regular optimizations have finished.  Optimizations should go here if
 # they help code generation but do not necessarily produce code that is
index 007b81c..28a73f8 100644 (file)
@@ -192,9 +192,9 @@ nir_opt_constant_folding(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress |= nir_opt_constant_folding_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= nir_opt_constant_folding_impl(function->impl);
    }
 
    return progress;
index cfc8e33..d99f78d 100644 (file)
@@ -281,8 +281,8 @@ nir_copy_prop(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl && nir_copy_prop_impl(overload->impl))
+   nir_foreach_function(shader, function) {
+      if (function->impl && nir_copy_prop_impl(function->impl))
          progress = true;
    }
 
index 93a6635..364fb02 100644 (file)
@@ -83,9 +83,9 @@ nir_opt_cse(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress |= nir_opt_cse_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= nir_opt_cse_impl(function->impl);
    }
 
    return progress;
index 6032528..32436c1 100644 (file)
@@ -174,8 +174,8 @@ bool
 nir_opt_dce(nir_shader *shader)
 {
    bool progress = false;
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl && nir_opt_dce_impl(overload->impl))
+   nir_foreach_function(shader, function) {
+      if (function->impl && nir_opt_dce_impl(function->impl))
          progress = true;
    }
 
index 356e926..4cc6798 100644 (file)
@@ -350,9 +350,9 @@ nir_opt_dead_cf(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload)
-      if (overload->impl)
-         progress |= opt_dead_cf_impl(overload->impl);
+   nir_foreach_function(shader, function)
+      if (function->impl)
+         progress |= opt_dead_cf_impl(function->impl);
 
    return progress;
 }
index 5b412ee..a8779ce 100644 (file)
@@ -487,8 +487,8 @@ opt_gcm_impl(nir_function_impl *impl)
 void
 nir_opt_gcm(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         opt_gcm_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         opt_gcm_impl(function->impl);
    }
 }
index 90902b9..0fc658d 100644 (file)
@@ -247,9 +247,9 @@ nir_opt_peephole_select(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress |= nir_opt_peephole_select_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= nir_opt_peephole_select_impl(function->impl);
    }
 
    return progress;
index 66d3754..6461837 100644 (file)
@@ -121,9 +121,9 @@ nir_opt_remove_phis(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload)
-      if (overload->impl)
-         progress = remove_phis_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function)
+      if (function->impl)
+         progress = remove_phis_impl(function->impl) || progress;
 
    return progress;
 }
index 4ab27a8..374564d 100644 (file)
@@ -90,11 +90,11 @@ nir_opt_undef(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         nir_foreach_block(overload->impl, opt_undef_block, &progress);
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_foreach_block(function->impl, opt_undef_block, &progress);
          if (progress)
-            nir_metadata_preserve(overload->impl,
+            nir_metadata_preserve(function->impl,
                                   nir_metadata_block_index |
                                   nir_metadata_dominance);
       }
diff --git a/src/glsl/nir/nir_phi_builder.c b/src/glsl/nir/nir_phi_builder.c
new file mode 100644 (file)
index 0000000..5429083
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_phi_builder.h"
+#include "nir/nir_vla.h"
+
+struct nir_phi_builder {
+   nir_shader *shader;
+   nir_function_impl *impl;
+
+   /* Copied from the impl for easy access */
+   unsigned num_blocks;
+
+   /* Array of all blocks indexed by block->index. */
+   nir_block **blocks;
+
+   /* Hold on to the values so we can easily iterate over them. */
+   struct exec_list values;
+
+   /* Worklist for phi adding */
+   unsigned iter_count;
+   unsigned *work;
+   nir_block **W;
+};
+
+#define NEEDS_PHI ((nir_ssa_def *)(intptr_t)-1)
+
+struct nir_phi_builder_value {
+   struct exec_node node;
+
+   struct nir_phi_builder *builder;
+
+   /* Needed so we can create phis and undefs */
+   unsigned num_components;
+
+   /* The list of phi nodes associated with this value.  Phi nodes are not
+    * added directly.  Instead, they are created, the instr->block pointer
+    * set, and then added to this list.  Later, in phi_builder_finish, we
+    * set up their sources and add them to the top of their respective
+    * blocks.
+    */
+   struct exec_list phis;
+
+   /* Array of SSA defs, indexed by block.  If a phi needs to be inserted
+    * in a given block, it will have the magic value NEEDS_PHI.
+    */
+   nir_ssa_def *defs[0];
+};
+
+static bool
+fill_block_array(nir_block *block, void *void_data)
+{
+   nir_block **blocks = void_data;
+   blocks[block->index] = block;
+   return true;
+}
+
+struct nir_phi_builder *
+nir_phi_builder_create(nir_function_impl *impl)
+{
+   struct nir_phi_builder *pb = ralloc(NULL, struct nir_phi_builder);
+
+   pb->shader = impl->function->shader;
+   pb->impl = impl;
+
+   assert(impl->valid_metadata & (nir_metadata_block_index |
+                                  nir_metadata_dominance));
+
+   pb->num_blocks = impl->num_blocks;
+   pb->blocks = ralloc_array(pb, nir_block *, pb->num_blocks);
+   nir_foreach_block(impl, fill_block_array, pb->blocks);
+
+   exec_list_make_empty(&pb->values);
+
+   pb->iter_count = 0;
+   pb->work = rzalloc_array(pb, unsigned, pb->num_blocks);
+   pb->W = ralloc_array(pb, nir_block *, pb->num_blocks);
+
+   return pb;
+}
+
+struct nir_phi_builder_value *
+nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
+                          const BITSET_WORD *defs)
+{
+   struct nir_phi_builder_value *val;
+   unsigned i, w_start = 0, w_end = 0;
+
+   val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks);
+   val->builder = pb;
+   val->num_components = num_components;
+   exec_list_make_empty(&val->phis);
+   exec_list_push_tail(&pb->values, &val->node);
+
+   pb->iter_count++;
+
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(i, tmp, defs, pb->num_blocks) {
+      if (pb->work[i] < pb->iter_count)
+         pb->W[w_end++] = pb->blocks[i];
+      pb->work[i] = pb->iter_count;
+   }
+
+   while (w_start != w_end) {
+      nir_block *cur = pb->W[w_start++];
+      struct set_entry *dom_entry;
+      set_foreach(cur->dom_frontier, dom_entry) {
+         nir_block *next = (nir_block *) dom_entry->key;
+
+         /*
+          * If there's more than one return statement, then the end block
+          * can be a join point for some definitions. However, there are
+          * no instructions in the end block, so nothing would use those
+          * phi nodes. Of course, we couldn't place those phi nodes
+          * anyway, given the restriction of having no instructions in the
+          * end block.
+          */
+         if (next == pb->impl->end_block)
+            continue;
+
+         if (val->defs[next->index] == NULL) {
+            val->defs[next->index] = NEEDS_PHI;
+
+            if (pb->work[next->index] < pb->iter_count) {
+               pb->work[next->index] = pb->iter_count;
+               pb->W[w_end++] = next;
+            }
+         }
+      }
+   }
+
+   return val;
+}
+
+void
+nir_phi_builder_value_set_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block, nir_ssa_def *def)
+{
+   val->defs[block->index] = def;
+}
+
+nir_ssa_def *
+nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block)
+{
+   if (val->defs[block->index] == NULL) {
+      if (block->imm_dom) {
+         /* Grab it from our immediate dominator.  We'll stash it here for
+          * easy access later.
+          */
+         val->defs[block->index] =
+            nir_phi_builder_value_get_block_def(val, block->imm_dom);
+         return val->defs[block->index];
+      } else {
+         /* No immediate dominator means that this block is either the
+          * start block or unreachable.  In either case, the value is
+          * undefined so we need an SSA undef.
+          */
+         nir_ssa_undef_instr *undef =
+            nir_ssa_undef_instr_create(val->builder->shader,
+                                       val->num_components);
+         nir_instr_insert(nir_before_cf_list(&val->builder->impl->body),
+                          &undef->instr);
+         val->defs[block->index] = &undef->def;
+         return &undef->def;
+      }
+   } else if (val->defs[block->index] == NEEDS_PHI) {
+      /* If we need a phi instruction, go ahead and create one but don't
+       * add it to the program yet.  Later, we'll go through, set up its
+       * phi sources, and the instruction will be added at that time.
+       */
+      nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader);
+      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL);
+      phi->instr.block = block;
+      exec_list_push_tail(&val->phis, &phi->instr.node);
+      val->defs[block->index] = &phi->dest.ssa;
+      return &phi->dest.ssa;
+   } else {
+      return val->defs[block->index];
+   }
+}
+
+static int
+compare_blocks(const void *_a, const void *_b)
+{
+   nir_block * const * a = _a;
+   nir_block * const * b = _b;
+
+   return (*a)->index - (*b)->index;
+}
+
+void
+nir_phi_builder_finish(struct nir_phi_builder *pb)
+{
+   const unsigned num_blocks = pb->num_blocks;
+   NIR_VLA(nir_block *, preds, num_blocks);
+
+   foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) {
+      /* We can't iterate over the list of phis normally because we are
+       * removing them as we go and, in some cases, adding new phis as we
+       * build the source lists of others.
+       */
+      while (!exec_list_is_empty(&val->phis)) {
+         struct exec_node *head = exec_list_get_head(&val->phis);
+         nir_phi_instr *phi = exec_node_data(nir_phi_instr, head, instr.node);
+         assert(phi->instr.type == nir_instr_type_phi);
+
+         exec_node_remove(&phi->instr.node);
+
+         /* Construct an array of predecessors.  We sort it to ensure
+          * determinism in the phi insertion algorithm.
+          *
+          * XXX: Calling qsort this many times seems expensive.
+          */
+         int num_preds = 0;
+         struct set_entry *entry;
+         set_foreach(phi->instr.block->predecessors, entry)
+            preds[num_preds++] = (nir_block *)entry->key;
+         qsort(preds, num_preds, sizeof(*preds), compare_blocks);
+
+         for (unsigned i = 0; i < num_preds; i++) {
+            nir_phi_src *src = ralloc(phi, nir_phi_src);
+            src->pred = preds[i];
+            src->src = nir_src_for_ssa(
+               nir_phi_builder_value_get_block_def(val, preds[i]));
+            exec_list_push_tail(&phi->srcs, &src->node);
+         }
+
+         nir_instr_insert(nir_before_block(phi->instr.block), &phi->instr);
+      }
+   }
+
+   ralloc_free(pb);
+}
diff --git a/src/glsl/nir/nir_phi_builder.h b/src/glsl/nir/nir_phi_builder.h
new file mode 100644 (file)
index 0000000..50251bf
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "nir.h"
+
+struct nir_phi_builder;
+struct nir_phi_builder_value;
+
+/* Create a new phi builder.
+ *
+ * While this is fairly cheap, it does allocate some memory and walk the list
+ * of blocks so it's recommended that you only call it once and use it to
+ * build phis for several values.
+ */
+struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl);
+
+/* Register a value with the builder.
+ *
+ * The 'defs' parameter specifies a bitset of blocks in which the given value
+ * is defined.  This is used to determine where to place the phi nodes.
+ */
+struct nir_phi_builder_value *
+nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
+                          const BITSET_WORD *defs);
+
+/* Register a definition for the given value and block.
+ *
+ * It is safe to call this function as many times as you wish for any given
+ * block/value pair.  However, it always replaces whatever was there
+ * previously even if that definition is from a phi node.  The phi builder
+ * always uses the latest information it has, so you must be careful about the
+ * order in which you register definitions.  The final value at the end of the
+ * block must be the last value registered.
+ */
+void
+nir_phi_builder_value_set_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block, nir_ssa_def *def);
+
+/* Get the definition for the given value in the given block.
+ *
+ * This definition will always be the latest definition known for the given
+ * block.  If no definition is immediately available, it will crawl up the
+ * dominance tree and insert phi nodes as needed until it finds one.  In the
+ * case that no suitable definition is found, it will return the result of a
+ * nir_ssa_undef_instr with the correct number of components.
+ *
+ * Because this function only uses the latest available information for any
+ * given block, you must have already finished registering definitions for any
+ * blocks that dominate the current block in order to get the correct result.
+ */
+nir_ssa_def *
+nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block);
+
+/* Finish building phi nodes and free the builder.
+ *
+ * This function does far more than just free memory.  Prior to calling
+ * nir_phi_builder_finish, no phi nodes have actually been inserted in the
+ * program.  This function is what finishes setting up phi node sources and
+ * adds the phi nodes to the program.
+ */
+void nir_phi_builder_finish(struct nir_phi_builder *pb);
index 1a4cc69..850774b 100644 (file)
@@ -219,6 +219,87 @@ print_alu_instr(nir_alu_instr *instr, print_state *state)
    }
 }
 
+static const char *
+get_var_name(nir_variable *var, print_state *state)
+{
+   if (state->ht == NULL)
+      return var->name;
+
+   assert(state->syms);
+
+   struct hash_entry *entry = _mesa_hash_table_search(state->ht, var);
+   if (entry)
+      return entry->data;
+
+   char *name;
+   if (var->name == NULL) {
+      name = ralloc_asprintf(state->syms, "@%u", state->index++);
+   } else {
+      struct set_entry *set_entry = _mesa_set_search(state->syms, var->name);
+      if (set_entry != NULL) {
+         /* we have a collision with another name, append an @ + a unique
+          * index */
+         name = ralloc_asprintf(state->syms, "%s@%u", var->name,
+                                state->index++);
+      } else {
+         /* Mark this one as seen */
+         _mesa_set_add(state->syms, var->name);
+         name = var->name;
+      }
+   }
+
+   _mesa_hash_table_insert(state->ht, var, name);
+
+   return name;
+}
+
+static void
+print_constant(nir_constant *c, const struct glsl_type *type, print_state *state)
+{
+   FILE *fp = state->fp;
+   unsigned total_elems = glsl_get_components(type);
+   unsigned i;
+
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+      for (i = 0; i < total_elems; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "0x%08x", c->value.u[i]);
+      }
+      break;
+
+   case GLSL_TYPE_FLOAT:
+      for (i = 0; i < total_elems; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "%f", c->value.f[i]);
+      }
+      break;
+
+   case GLSL_TYPE_STRUCT:
+      for (i = 0; i < c->num_elements; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "{ ");
+         print_constant(c->elements[i], glsl_get_struct_field(type, i), state);
+         fprintf(fp, " }");
+      }
+      break;
+
+   case GLSL_TYPE_ARRAY:
+      for (i = 0; i < c->num_elements; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "{ ");
+         print_constant(c->elements[i], glsl_get_array_element(type), state);
+         fprintf(fp, " }");
+      }
+      break;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
 static void
 print_var_decl(nir_variable *var, print_state *state)
 {
@@ -231,7 +312,8 @@ print_var_decl(nir_variable *var, print_state *state)
    const char *const patch = (var->data.patch) ? "patch " : "";
    const char *const inv = (var->data.invariant) ? "invariant " : "";
    const char *const mode[] = { "shader_in ", "shader_out ", "", "",
-                                "uniform ", "shader_storage", "system " };
+                                "uniform ", "shader_storage ", "shared ",
+                                "system "};
 
    fprintf(fp, "%s%s%s%s%s%s ",
       cent, samp, patch, inv, mode[var->data.mode],
@@ -239,20 +321,7 @@ print_var_decl(nir_variable *var, print_state *state)
 
    glsl_print_type(var->type, fp);
 
-   struct set_entry *entry = NULL;
-   if (state->syms)
-      entry = _mesa_set_search(state->syms, var->name);
-
-   char *name;
-
-   if (entry != NULL) {
-      /* we have a collision with another name, append an @ + a unique index */
-      name = ralloc_asprintf(state->syms, "%s@%u", var->name, state->index++);
-   } else {
-      name = var->name;
-   }
-
-   fprintf(fp, " %s", name);
+   fprintf(fp, " %s", get_var_name(var, state));
 
    if (var->data.mode == nir_var_shader_in ||
        var->data.mode == nir_var_shader_out ||
@@ -295,29 +364,20 @@ print_var_decl(nir_variable *var, print_state *state)
       fprintf(fp, " (%s, %u)", loc, var->data.driver_location);
    }
 
-   fprintf(fp, "\n");
-
-   if (state->syms) {
-      _mesa_set_add(state->syms, name);
-      _mesa_hash_table_insert(state->ht, var, name);
+   if (var->constant_initializer) {
+      fprintf(fp, " = { ");
+      print_constant(var->constant_initializer, var->type, state);
+      fprintf(fp, " }");
    }
+
+   fprintf(fp, "\n");
 }
 
 static void
 print_var(nir_variable *var, print_state *state)
 {
    FILE *fp = state->fp;
-   const char *name;
-   if (state->ht) {
-      struct hash_entry *entry = _mesa_hash_table_search(state->ht, var);
-
-      assert(entry != NULL);
-      name = entry->data;
-   } else {
-      name = var->name;
-   }
-
-   fprintf(fp, "%s", name);
+   fprintf(fp, "%s", get_var_name(var, state));
 }
 
 static void
@@ -547,6 +607,9 @@ print_tex_instr(nir_tex_instr *instr, print_state *state)
       case nir_tex_src_ddy:
          fprintf(fp, "(ddy)");
          break;
+      case nir_tex_src_texture_offset:
+         fprintf(fp, "(texture_offset)");
+         break;
       case nir_tex_src_sampler_offset:
          fprintf(fp, "(sampler_offset)");
          break;
@@ -577,13 +640,18 @@ print_tex_instr(nir_tex_instr *instr, print_state *state)
       fprintf(fp, "%u (gather_component), ", instr->component);
    }
 
+   if (instr->texture) {
+      assert(instr->sampler);
+      fprintf(fp, " (texture)");
+   }
    if (instr->sampler) {
       print_deref(instr->sampler, state);
+      fprintf(fp, " (sampler)");
    } else {
-      fprintf(fp, "%u", instr->sampler_index);
+      assert(instr->texture == NULL);
+      fprintf(fp, "%u (texture) %u (sampler)",
+              instr->texture_index, instr->sampler_index);
    }
-
-   fprintf(fp, " (sampler)");
 }
 
 static void
@@ -591,7 +659,7 @@ print_call_instr(nir_call_instr *instr, print_state *state)
 {
    FILE *fp = state->fp;
 
-   fprintf(fp, "call %s ", instr->callee->function->name);
+   fprintf(fp, "call %s ", instr->callee->name);
 
    for (unsigned i = 0; i < instr->num_params; i++) {
       if (i != 0)
@@ -857,7 +925,7 @@ print_function_impl(nir_function_impl *impl, print_state *state)
 {
    FILE *fp = state->fp;
 
-   fprintf(fp, "\nimpl %s ", impl->overload->function->name);
+   fprintf(fp, "\nimpl %s ", impl->function->name);
 
    for (unsigned i = 0; i < impl->num_params; i++) {
       if (i != 0)
@@ -895,18 +963,17 @@ print_function_impl(nir_function_impl *impl, print_state *state)
 }
 
 static void
-print_function_overload(nir_function_overload *overload,
-                        print_state *state)
+print_function(nir_function *function, print_state *state)
 {
    FILE *fp = state->fp;
 
-   fprintf(fp, "decl_overload %s ", overload->function->name);
+   fprintf(fp, "decl_function %s ", function->name);
 
-   for (unsigned i = 0; i < overload->num_params; i++) {
+   for (unsigned i = 0; i < function->num_params; i++) {
       if (i != 0)
          fprintf(fp, ", ");
 
-      switch (overload->params[i].param_type) {
+      switch (function->params[i].param_type) {
       case nir_parameter_in:
          fprintf(fp, "in ");
          break;
@@ -920,33 +987,25 @@ print_function_overload(nir_function_overload *overload,
          unreachable("Invalid parameter type");
       }
 
-      glsl_print_type(overload->params[i].type, fp);
+      glsl_print_type(function->params[i].type, fp);
    }
 
-   if (overload->return_type != NULL) {
-      if (overload->num_params != 0)
+   if (function->return_type != NULL) {
+      if (function->num_params != 0)
          fprintf(fp, ", ");
       fprintf(fp, "returning ");
-      glsl_print_type(overload->return_type, fp);
+      glsl_print_type(function->return_type, fp);
    }
 
    fprintf(fp, "\n");
 
-   if (overload->impl != NULL) {
-      print_function_impl(overload->impl, state);
+   if (function->impl != NULL) {
+      print_function_impl(function->impl, state);
       return;
    }
 }
 
 static void
-print_function(nir_function *func, print_state *state)
-{
-   foreach_list_typed(nir_function_overload, overload, node, &func->overload_list) {
-      print_function_overload(overload, state);
-   }
-}
-
-static void
 init_print_state(print_state *state, nir_shader *shader, FILE *fp)
 {
    state->fp = fp;
@@ -982,6 +1041,7 @@ nir_print_shader(nir_shader *shader, FILE *fp)
    fprintf(fp, "inputs: %u\n", shader->num_inputs);
    fprintf(fp, "outputs: %u\n", shader->num_outputs);
    fprintf(fp, "uniforms: %u\n", shader->num_uniforms);
+   fprintf(fp, "shared: %u\n", shader->num_shared);
 
    nir_foreach_variable(var, &shader->uniforms) {
       print_var_decl(var, &state);
@@ -995,6 +1055,10 @@ nir_print_shader(nir_shader *shader, FILE *fp)
       print_var_decl(var, &state);
    }
 
+   nir_foreach_variable(var, &shader->shared) {
+      print_var_decl(var, &state);
+   }
+
    nir_foreach_variable(var, &shader->globals) {
       print_var_decl(var, &state);
    }
index 8f0833c..792c5d4 100644 (file)
@@ -90,9 +90,9 @@ add_var_use_block(nir_block *block, void *state)
 static void
 add_var_use_shader(nir_shader *shader, struct set *live)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         nir_foreach_block(overload->impl, add_var_use_block, live);
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_foreach_block(function->impl, add_var_use_block, live);
       }
    }
 }
@@ -115,7 +115,7 @@ remove_dead_vars(struct exec_list *var_list, struct set *live)
 }
 
 bool
-nir_remove_dead_variables(nir_shader *shader)
+nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode)
 {
    bool progress = false;
    struct set *live =
@@ -123,15 +123,30 @@ nir_remove_dead_variables(nir_shader *shader)
 
    add_var_use_shader(shader, live);
 
-   progress = remove_dead_vars(&shader->globals, live) || progress;
+   if (mode == nir_var_uniform || mode == nir_var_all)
+      progress = remove_dead_vars(&shader->uniforms, live) || progress;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl) {
-         if (remove_dead_vars(&overload->impl->locals, live)) {
-            nir_metadata_preserve(overload->impl, nir_metadata_block_index |
-                                                  nir_metadata_dominance |
-                                                  nir_metadata_live_ssa_defs);
-            progress = true;
+   if (mode == nir_var_shader_in || mode == nir_var_all)
+      progress = remove_dead_vars(&shader->inputs, live) || progress;
+
+   if (mode == nir_var_shader_out || mode == nir_var_all)
+      progress = remove_dead_vars(&shader->outputs, live) || progress;
+
+   if (mode == nir_var_global || mode == nir_var_all)
+      progress = remove_dead_vars(&shader->globals, live) || progress;
+
+   if (mode == nir_var_system_value || mode == nir_var_all)
+      progress = remove_dead_vars(&shader->system_values, live) || progress;
+
+   if (mode == nir_var_local || mode == nir_var_all) {
+      nir_foreach_function(shader, function) {
+         if (function->impl) {
+            if (remove_dead_vars(&function->impl->locals, live)) {
+               nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                                     nir_metadata_dominance |
+                                                     nir_metadata_live_ssa_defs);
+               progress = true;
+            }
          }
       }
    }
diff --git a/src/glsl/nir/nir_repair_ssa.c b/src/glsl/nir/nir_repair_ssa.c
new file mode 100644 (file)
index 0000000..3ab4f0f
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_phi_builder.h"
+
+struct repair_ssa_state {
+   nir_function_impl *impl;
+
+   BITSET_WORD *def_set;
+   struct nir_phi_builder *phi_builder;
+
+   bool progress;
+};
+
+/* Get ready to build a phi and return the builder */
+static struct nir_phi_builder *
+prep_build_phi(struct repair_ssa_state *state)
+{
+   const unsigned num_words = BITSET_WORDS(state->impl->num_blocks);
+
+   /* We create the phi builder on-demand. */
+   if (state->phi_builder == NULL) {
+      state->phi_builder = nir_phi_builder_create(state->impl);
+      state->def_set = ralloc_array(NULL, BITSET_WORD, num_words);
+   }
+
+   /* We're going to build a phi.  That's progress. */
+   state->progress = true;
+
+   /* Set the defs set to empty */
+   memset(state->def_set, 0, num_words * sizeof(*state->def_set));
+
+   return state->phi_builder;
+}
+
+static nir_block *
+get_src_block(nir_src *src)
+{
+   if (src->parent_instr->type == nir_instr_type_phi) {
+      return exec_node_data(nir_phi_src, src, src)->pred;
+   } else {
+      return src->parent_instr->block;
+   }
+}
+
+static bool
+repair_ssa_def(nir_ssa_def *def, void *void_state)
+{
+   struct repair_ssa_state *state = void_state;
+
+   bool is_valid = true;
+   nir_foreach_use(def, src) {
+      if (!nir_block_dominates(def->parent_instr->block, get_src_block(src))) {
+         is_valid = false;
+         break;
+      }
+   }
+
+   if (is_valid)
+      return true;
+
+   struct nir_phi_builder *pb = prep_build_phi(state);
+
+   BITSET_SET(state->def_set, def->parent_instr->block->index);
+
+   struct nir_phi_builder_value *val =
+      nir_phi_builder_add_value(pb, def->num_components, state->def_set);
+
+   nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);
+
+   nir_foreach_use_safe(def, src) {
+      nir_block *src_block = get_src_block(src);
+      if (!nir_block_dominates(def->parent_instr->block, src_block)) {
+         nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(
+            nir_phi_builder_value_get_block_def(val, src_block)));
+      }
+   }
+
+   return true;
+}
+
+static bool
+repair_ssa_block(nir_block *block, void *state)
+{
+   nir_foreach_instr_safe(block, instr) {
+      nir_foreach_ssa_def(instr, repair_ssa_def, state);
+   }
+
+   return true;
+}
+
+bool
+nir_repair_ssa_impl(nir_function_impl *impl)
+{
+   struct repair_ssa_state state;
+
+   state.impl = impl;
+   state.phi_builder = NULL;
+   state.progress = false;
+
+   nir_metadata_require(impl, nir_metadata_block_index |
+                              nir_metadata_dominance);
+
+   nir_foreach_block(impl, repair_ssa_block, &state);
+
+   if (state.progress)
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+
+   if (state.phi_builder) {
+      nir_phi_builder_finish(state.phi_builder);
+      ralloc_free(state.def_set);
+   }
+
+   return state.progress;
+}
+
+/** This pass can be used to repair SSA form in a shader.
+ *
+ * Sometimes a transformation (such as return lowering) will have to make
+ * changes to a shader which, while still correct, break some of NIR's SSA
+ * invariants.  This pass will insert ssa_undefs and phi nodes as needed to
+ * get the shader back into SSA that the validator will like.
+ */
+bool
+nir_repair_ssa(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_repair_ssa_impl(function->impl) || progress;
+   }
+
+   return progress;
+}
index bfbef72..6fdaefa 100644 (file)
@@ -276,9 +276,9 @@ nir_split_var_copies(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = split_var_copies_impl(overload->impl) || progress;
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = split_var_copies_impl(function->impl) || progress;
    }
 
    return progress;
index 5a22f50..5c62154 100644 (file)
@@ -137,13 +137,10 @@ static void
 sweep_function(nir_shader *nir, nir_function *f)
 {
    ralloc_steal(nir, f);
+   ralloc_steal(nir, f->params);
 
-   foreach_list_typed(nir_function_overload, overload, node, &f->overload_list) {
-      ralloc_steal(nir, overload);
-      ralloc_steal(nir, overload->params);
-      if (overload->impl)
-         sweep_impl(nir, overload->impl);
-   }
+   if (f->impl)
+      sweep_impl(nir, f->impl);
 }
 
 void
@@ -162,6 +159,7 @@ nir_sweep(nir_shader *nir)
    steal_list(nir, nir_variable, &nir->uniforms);
    steal_list(nir, nir_variable, &nir->inputs);
    steal_list(nir, nir_variable, &nir->outputs);
+   steal_list(nir, nir_variable, &nir->shared);
    steal_list(nir, nir_variable, &nir->globals);
    steal_list(nir, nir_variable, &nir->system_values);
    steal_list(nir, nir_register, &nir->registers);
index b089df7..44a5054 100644 (file)
@@ -529,8 +529,8 @@ nir_convert_to_ssa_impl(nir_function_impl *impl)
 void
 nir_convert_to_ssa(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_convert_to_ssa_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_convert_to_ssa_impl(function->impl);
    }
 }
index 135591a..f4408de 100644 (file)
@@ -70,6 +70,18 @@ glsl_get_struct_field(const glsl_type *type, unsigned index)
    return type->fields.structure[index].type;
 }
 
+const glsl_type *
+glsl_get_function_return_type(const glsl_type *type)
+{
+   return type->fields.parameters[0].type;
+}
+
+const glsl_function_param *
+glsl_get_function_param(const glsl_type *type, unsigned index)
+{
+   return &type->fields.parameters[index + 1];
+}
+
 const struct glsl_type *
 glsl_get_column_type(const struct glsl_type *type)
 {
@@ -112,12 +124,33 @@ glsl_get_aoa_size(const struct glsl_type *type)
    return type->arrays_of_arrays_size();
 }
 
+unsigned
+glsl_count_attribute_slots(const struct glsl_type *type,
+                           bool vertex_input_slots)
+{
+   return type->count_attribute_slots(vertex_input_slots);
+}
+
 const char *
 glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index)
 {
    return type->fields.structure[index].name;
 }
 
+glsl_sampler_dim
+glsl_get_sampler_dim(const struct glsl_type *type)
+{
+   assert(glsl_type_is_sampler(type) || glsl_type_is_image(type));
+   return (glsl_sampler_dim)type->sampler_dimensionality;
+}
+
+glsl_base_type
+glsl_get_sampler_result_type(const struct glsl_type *type)
+{
+   assert(glsl_type_is_sampler(type) || glsl_type_is_image(type));
+   return (glsl_base_type)type->sampler_type;
+}
+
 unsigned
 glsl_get_record_location_offset(const struct glsl_type *type,
                                 unsigned length)
@@ -132,6 +165,12 @@ glsl_type_is_void(const glsl_type *type)
 }
 
 bool
+glsl_type_is_error(const glsl_type *type)
+{
+   return type->is_error();
+}
+
+bool
 glsl_type_is_vector(const struct glsl_type *type)
 {
    return type->is_vector();
@@ -155,6 +194,44 @@ glsl_type_is_matrix(const struct glsl_type *type)
    return type->is_matrix();
 }
 
+bool
+glsl_type_is_array(const struct glsl_type *type)
+{
+   return type->is_array();
+}
+
+bool
+glsl_type_is_struct(const struct glsl_type *type)
+{
+   return type->is_record() || type->is_interface();
+}
+
+bool
+glsl_type_is_sampler(const struct glsl_type *type)
+{
+   return type->is_sampler();
+}
+
+bool
+glsl_type_is_image(const struct glsl_type *type)
+{
+   return type->is_image();
+}
+
+bool
+glsl_sampler_type_is_shadow(const struct glsl_type *type)
+{
+   assert(glsl_type_is_sampler(type));
+   return type->sampler_shadow;
+}
+
+bool
+glsl_sampler_type_is_array(const struct glsl_type *type)
+{
+   assert(glsl_type_is_sampler(type) || glsl_type_is_image(type));
+   return type->sampler_array;
+}
+
 const glsl_type *
 glsl_void_type(void)
 {
@@ -168,19 +245,92 @@ glsl_float_type(void)
 }
 
 const glsl_type *
+glsl_vec_type(unsigned n)
+{
+   return glsl_type::vec(n);
+}
+
+const glsl_type *
 glsl_vec4_type(void)
 {
    return glsl_type::vec4_type;
 }
 
 const glsl_type *
+glsl_int_type(void)
+{
+   return glsl_type::int_type;
+}
+
+const glsl_type *
 glsl_uint_type(void)
 {
    return glsl_type::uint_type;
 }
 
 const glsl_type *
+glsl_bool_type(void)
+{
+   return glsl_type::bool_type;
+}
+
+const glsl_type *
+glsl_scalar_type(enum glsl_base_type base_type)
+{
+   return glsl_type::get_instance(base_type, 1, 1);
+}
+
+const glsl_type *
+glsl_vector_type(enum glsl_base_type base_type, unsigned components)
+{
+   assert(components > 1 && components <= 4);
+   return glsl_type::get_instance(base_type, components, 1);
+}
+
+const glsl_type *
+glsl_matrix_type(enum glsl_base_type base_type, unsigned rows, unsigned columns)
+{
+   assert(rows > 1 && rows <= 4 && columns >= 1 && columns <= 4);
+   return glsl_type::get_instance(base_type, rows, columns);
+}
+
+const glsl_type *
 glsl_array_type(const glsl_type *base, unsigned elements)
 {
    return glsl_type::get_array_instance(base, elements);
 }
+
+const glsl_type *
+glsl_struct_type(const glsl_struct_field *fields,
+                 unsigned num_fields, const char *name)
+{
+   return glsl_type::get_record_instance(fields, num_fields, name);
+}
+
+const struct glsl_type *
+glsl_sampler_type(enum glsl_sampler_dim dim, bool is_shadow, bool is_array,
+                  enum glsl_base_type base_type)
+{
+   return glsl_type::get_sampler_instance(dim, is_shadow, is_array, base_type);
+}
+
+const struct glsl_type *
+glsl_image_type(enum glsl_sampler_dim dim, bool is_array,
+                enum glsl_base_type base_type)
+{
+   return glsl_type::get_image_instance(dim, is_array, base_type);
+}
+
+const glsl_type *
+glsl_function_type(const glsl_type *return_type,
+                   const glsl_function_param *params, unsigned num_params)
+{
+   return glsl_type::get_function_instance(return_type, params, num_params);
+}
+
+const glsl_type *
+glsl_transposed_type(const struct glsl_type *type)
+{
+   return glsl_type::get_instance(type->base_type, type->matrix_columns,
+                                  type->vector_elements);
+}
index b0b5184..8cb7ec1 100644 (file)
@@ -49,6 +49,12 @@ const struct glsl_type *glsl_get_array_element(const struct glsl_type *type);
 
 const struct glsl_type *glsl_get_column_type(const struct glsl_type *type);
 
+const struct glsl_type *
+glsl_get_function_return_type(const struct glsl_type *type);
+
+const struct glsl_function_param *
+glsl_get_function_param(const struct glsl_type *type, unsigned index);
+
 enum glsl_base_type glsl_get_base_type(const struct glsl_type *type);
 
 unsigned glsl_get_vector_elements(const struct glsl_type *type);
@@ -61,24 +67,59 @@ unsigned glsl_get_length(const struct glsl_type *type);
 
 unsigned glsl_get_aoa_size(const struct glsl_type *type);
 
+unsigned glsl_count_attribute_slots(const struct glsl_type *type,
+                                    bool vertex_input_slots);
+
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
 
+enum glsl_sampler_dim glsl_get_sampler_dim(const struct glsl_type *type);
+enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
+
 unsigned glsl_get_record_location_offset(const struct glsl_type *type,
                                          unsigned length);
 
 bool glsl_type_is_void(const struct glsl_type *type);
+bool glsl_type_is_error(const struct glsl_type *type);
 bool glsl_type_is_vector(const struct glsl_type *type);
 bool glsl_type_is_scalar(const struct glsl_type *type);
 bool glsl_type_is_vector_or_scalar(const struct glsl_type *type);
 bool glsl_type_is_matrix(const struct glsl_type *type);
+bool glsl_type_is_array(const struct glsl_type *type);
+bool glsl_type_is_struct(const struct glsl_type *type);
+bool glsl_type_is_sampler(const struct glsl_type *type);
+bool glsl_type_is_image(const struct glsl_type *type);
+bool glsl_sampler_type_is_shadow(const struct glsl_type *type);
+bool glsl_sampler_type_is_array(const struct glsl_type *type);
 
 const struct glsl_type *glsl_void_type(void);
 const struct glsl_type *glsl_float_type(void);
+const struct glsl_type *glsl_vec_type(unsigned n);
 const struct glsl_type *glsl_vec4_type(void);
+const struct glsl_type *glsl_int_type(void);
 const struct glsl_type *glsl_uint_type(void);
+const struct glsl_type *glsl_bool_type(void);
+
+const struct glsl_type *glsl_scalar_type(enum glsl_base_type base_type);
+const struct glsl_type *glsl_vector_type(enum glsl_base_type base_type,
+                                         unsigned components);
+const struct glsl_type *glsl_matrix_type(enum glsl_base_type base_type,
+                                         unsigned rows, unsigned columns);
 const struct glsl_type *glsl_array_type(const struct glsl_type *base,
                                         unsigned elements);
+const struct glsl_type *glsl_struct_type(const struct glsl_struct_field *fields,
+                                         unsigned num_fields, const char *name);
+const struct glsl_type *glsl_sampler_type(enum glsl_sampler_dim dim,
+                                          bool is_shadow, bool is_array,
+                                          enum glsl_base_type base_type);
+const struct glsl_type *glsl_image_type(enum glsl_sampler_dim dim,
+                                        bool is_array,
+                                        enum glsl_base_type base_type);
+const struct glsl_type * glsl_function_type(const struct glsl_type *return_type,
+                                            const struct glsl_function_param *params,
+                                            unsigned num_params);
+
+const struct glsl_type *glsl_transposed_type(const struct glsl_type *type);
 
 #ifdef __cplusplus
 }
index 06879d6..1a943d7 100644 (file)
@@ -417,6 +417,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
              instr->variables[0]->var->data.mode != nir_var_uniform &&
              instr->variables[0]->var->data.mode != nir_var_shader_storage);
+      assert((instr->const_index[0] & ~((1 << instr->num_components) - 1)) == 0);
       break;
    }
    case nir_intrinsic_copy_var:
@@ -453,10 +454,12 @@ validate_tex_instr(nir_tex_instr *instr, validate_state *state)
 static void
 validate_call_instr(nir_call_instr *instr, validate_state *state)
 {
-   if (instr->return_deref == NULL)
+   if (instr->return_deref == NULL) {
       assert(glsl_type_is_void(instr->callee->return_type));
-   else
+   } else {
       assert(instr->return_deref->deref.type == instr->callee->return_type);
+      validate_deref_var(instr, instr->return_deref, state);
+   }
 
    assert(instr->num_params == instr->callee->num_params);
 
@@ -464,8 +467,6 @@ validate_call_instr(nir_call_instr *instr, validate_state *state)
       assert(instr->callee->params[i].type == instr->params[i]->deref.type);
       validate_deref_var(instr, instr->params[i], state);
    }
-
-   validate_deref_var(instr, instr->return_deref, state);
 }
 
 static void
@@ -928,17 +929,17 @@ postvalidate_ssa_defs_block(nir_block *block, void *state)
 static void
 validate_function_impl(nir_function_impl *impl, validate_state *state)
 {
-   assert(impl->overload->impl == impl);
+   assert(impl->function->impl == impl);
    assert(impl->cf_node.parent == NULL);
 
-   assert(impl->num_params == impl->overload->num_params);
+   assert(impl->num_params == impl->function->num_params);
    for (unsigned i = 0; i < impl->num_params; i++)
-      assert(impl->params[i]->type == impl->overload->params[i].type);
+      assert(impl->params[i]->type == impl->function->params[i].type);
 
-   if (glsl_type_is_void(impl->overload->return_type))
+   if (glsl_type_is_void(impl->function->return_type))
       assert(impl->return_var == NULL);
    else
-      assert(impl->return_var->type == impl->overload->return_type);
+      assert(impl->return_var->type == impl->function->return_type);
 
    assert(exec_list_is_empty(&impl->end_block->instr_list));
    assert(impl->end_block->successors[0] == NULL);
@@ -980,20 +981,11 @@ validate_function_impl(nir_function_impl *impl, validate_state *state)
 }
 
 static void
-validate_function_overload(nir_function_overload *overload,
-                           validate_state *state)
-{
-   if (overload->impl != NULL)
-      validate_function_impl(overload->impl, state);
-}
-
-static void
 validate_function(nir_function *func, validate_state *state)
 {
-   exec_list_validate(&func->overload_list);
-   foreach_list_typed(nir_function_overload, overload, node, &func->overload_list) {
-      assert(overload->function == func);
-      validate_function_overload(overload, state);
+   if (func->impl != NULL) {
+      assert(func->impl->function == func);
+      validate_function_impl(func->impl, state);
    }
 }
 
@@ -1044,6 +1036,11 @@ nir_validate_shader(nir_shader *shader)
      validate_var_decl(var, true, &state);
    }
 
+   exec_list_validate(&shader->shared);
+   nir_foreach_variable(var, &shader->shared) {
+      validate_var_decl(var, true, &state);
+   }
+
    exec_list_validate(&shader->globals);
    nir_foreach_variable(var, &shader->globals) {
      validate_var_decl(var, true, &state);
index 66a25e7..1410a50 100644 (file)
@@ -47,6 +47,42 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
    return NAME(stage);
 }
 
+/**
+ * Translate a gl_shader_stage to a short shader stage name for debug
+ * printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "vertex";
+   case MESA_SHADER_FRAGMENT: return "fragment";
+   case MESA_SHADER_GEOMETRY: return "geometry";
+   case MESA_SHADER_COMPUTE:  return "compute";
+   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
+   case MESA_SHADER_TESS_EVAL: return "tess eval";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
+/**
+ * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
+ * for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:   return "VS";
+   case MESA_SHADER_FRAGMENT: return "FS";
+   case MESA_SHADER_GEOMETRY: return "GS";
+   case MESA_SHADER_COMPUTE:  return "CS";
+   case MESA_SHADER_TESS_CTRL: return "TCS";
+   case MESA_SHADER_TESS_EVAL: return "TES";
+   }
+
+   unreachable("Unknown shader stage.");
+}
+
 const char * gl_vert_attrib_name(gl_vert_attrib attrib)
 {
    static const char *names[] = {
@@ -172,6 +208,8 @@ const char * gl_system_value_name(gl_system_value sysval)
      ENUM(SYSTEM_VALUE_TESS_LEVEL_OUTER),
      ENUM(SYSTEM_VALUE_TESS_LEVEL_INNER),
      ENUM(SYSTEM_VALUE_LOCAL_INVOCATION_ID),
+     ENUM(SYSTEM_VALUE_LOCAL_INVOCATION_INDEX),
+     ENUM(SYSTEM_VALUE_GLOBAL_INVOCATION_ID),
      ENUM(SYSTEM_VALUE_WORK_GROUP_ID),
      ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS),
      ENUM(SYSTEM_VALUE_VERTEX_CNT),
index dd0e0ba..bc6ea38 100644 (file)
 #ifndef SHADER_ENUMS_H
 #define SHADER_ENUMS_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Shader stages. Note that these will become 5 with tessellation.
  *
@@ -45,6 +49,18 @@ typedef enum
 
 const char * gl_shader_stage_name(gl_shader_stage stage);
 
+/**
+ * Translate a gl_shader_stage to a short shader stage name for debug
+ * printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_string(unsigned stage);
+
+/**
+ * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
+ * for debug printouts and error messages.
+ */
+const char * _mesa_shader_stage_to_abbrev(unsigned stage);
+
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
 
 
@@ -379,6 +395,26 @@ typedef enum
     * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
     */
    SYSTEM_VALUE_BASE_VERTEX,
+
+   /**
+    * Value of \c baseinstance passed to instanced draw entry points
+    *
+    * \sa SYSTEM_VALUE_INSTANCE_ID
+    */
+   SYSTEM_VALUE_BASE_INSTANCE,
+
+   /**
+    * From _ARB_shader_draw_parameters:
+    *
+    *   "Additionally, this extension adds a further built-in variable,
+    *    gl_DrawID to the shading language. This variable contains the index
+    *    of the draw currently being processed by a Multi* variant of a
+    *    drawing command (such as MultiDrawElements or
+    *    MultiDrawArraysIndirect)."
+    *
+    * If GL_ARB_multi_draw_indirect is not supported, this is always 0.
+    */
+   SYSTEM_VALUE_DRAW_ID,
    /*@}*/
 
    /**
@@ -392,7 +428,8 @@ typedef enum
     * \name Fragment shader system values
     */
    /*@{*/
-   SYSTEM_VALUE_FRONT_FACE,     /**< (not done yet) */
+   SYSTEM_VALUE_FRAG_COORD,
+   SYSTEM_VALUE_FRONT_FACE,
    SYSTEM_VALUE_SAMPLE_ID,
    SYSTEM_VALUE_SAMPLE_POS,
    SYSTEM_VALUE_SAMPLE_MASK_IN,
@@ -415,6 +452,8 @@ typedef enum
     */
    /*@{*/
    SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+   SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
+   SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
    SYSTEM_VALUE_WORK_GROUP_ID,
    SYSTEM_VALUE_NUM_WORK_GROUPS,
    /*@}*/
@@ -498,4 +537,8 @@ enum gl_frag_depth_layout
    FRAG_DEPTH_LAYOUT_UNCHANGED
 };
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* SHADER_ENUMS_H */
diff --git a/src/glsl/nir/spirv/GLSL.std.450.h b/src/glsl/nir/spirv/GLSL.std.450.h
new file mode 100644 (file)
index 0000000..d1c9b5c
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+** Copyright (c) 2014-2015 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and/or associated documentation files (the "Materials"),
+** to deal in the Materials without restriction, including without limitation
+** the rights to use, copy, modify, merge, publish, distribute, sublicense,
+** and/or sell copies of the Materials, and to permit persons to whom the
+** Materials are furnished to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in
+** all copies or substantial portions of the Materials.
+**
+** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS
+** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND
+** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ 
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS
+** IN THE MATERIALS.
+*/
+
+#ifndef GLSLstd450_H
+#define GLSLstd450_H
+
+const int GLSLstd450Version = 99;
+const int GLSLstd450Revision = 3;
+
+enum GLSLstd450 {
+    GLSLstd450Bad = 0,              // Don't use
+
+    GLSLstd450Round = 1,
+    GLSLstd450RoundEven = 2,
+    GLSLstd450Trunc = 3,
+    GLSLstd450FAbs = 4,
+    GLSLstd450SAbs = 5,
+    GLSLstd450FSign = 6,
+    GLSLstd450SSign = 7,
+    GLSLstd450Floor = 8,
+    GLSLstd450Ceil = 9,
+    GLSLstd450Fract = 10,
+
+    GLSLstd450Radians = 11,
+    GLSLstd450Degrees = 12,
+    GLSLstd450Sin = 13,
+    GLSLstd450Cos = 14,
+    GLSLstd450Tan = 15,
+    GLSLstd450Asin = 16,
+    GLSLstd450Acos = 17,
+    GLSLstd450Atan = 18,
+    GLSLstd450Sinh = 19,
+    GLSLstd450Cosh = 20,
+    GLSLstd450Tanh = 21,
+    GLSLstd450Asinh = 22,
+    GLSLstd450Acosh = 23,
+    GLSLstd450Atanh = 24,
+    GLSLstd450Atan2 = 25,
+
+    GLSLstd450Pow = 26,
+    GLSLstd450Exp = 27,
+    GLSLstd450Log = 28,
+    GLSLstd450Exp2 = 29,
+    GLSLstd450Log2 = 30,
+    GLSLstd450Sqrt = 31,
+    GLSLstd450InverseSqrt = 32,
+
+    GLSLstd450Determinant = 33,
+    GLSLstd450MatrixInverse = 34,
+
+    GLSLstd450Modf = 35,            // second operand needs an OpVariable to write to
+    GLSLstd450ModfStruct = 36,      // no OpVariable operand
+    GLSLstd450FMin = 37,
+    GLSLstd450UMin = 38,
+    GLSLstd450SMin = 39,
+    GLSLstd450FMax = 40,
+    GLSLstd450UMax = 41,
+    GLSLstd450SMax = 42,
+    GLSLstd450FClamp = 43,
+    GLSLstd450UClamp = 44,
+    GLSLstd450SClamp = 45,
+    GLSLstd450FMix = 46,
+    GLSLstd450IMix = 47,
+    GLSLstd450Step = 48,
+    GLSLstd450SmoothStep = 49,
+
+    GLSLstd450Fma = 50,
+    GLSLstd450Frexp = 51,            // second operand needs an OpVariable to write to
+    GLSLstd450FrexpStruct = 52,      // no OpVariable operand
+    GLSLstd450Ldexp = 53,
+
+    GLSLstd450PackSnorm4x8 = 54,
+    GLSLstd450PackUnorm4x8 = 55,
+    GLSLstd450PackSnorm2x16 = 56,
+    GLSLstd450PackUnorm2x16 = 57,
+    GLSLstd450PackHalf2x16 = 58,
+    GLSLstd450PackDouble2x32 = 59,
+    GLSLstd450UnpackSnorm2x16 = 60,
+    GLSLstd450UnpackUnorm2x16 = 61,
+    GLSLstd450UnpackHalf2x16 = 62,
+    GLSLstd450UnpackSnorm4x8 = 63,
+    GLSLstd450UnpackUnorm4x8 = 64,
+    GLSLstd450UnpackDouble2x32 = 65,
+
+    GLSLstd450Length = 66,
+    GLSLstd450Distance = 67,
+    GLSLstd450Cross = 68,
+    GLSLstd450Normalize = 69,
+    GLSLstd450FaceForward = 70,
+    GLSLstd450Reflect = 71,
+    GLSLstd450Refract = 72,
+
+    GLSLstd450FindILsb = 73,
+    GLSLstd450FindSMsb = 74,
+    GLSLstd450FindUMsb = 75,
+
+    GLSLstd450InterpolateAtCentroid = 76,
+    GLSLstd450InterpolateAtSample = 77,
+    GLSLstd450InterpolateAtOffset = 78,
+
+    GLSLstd450Count
+};
+
+#endif  // #ifndef GLSLstd450_H
diff --git a/src/glsl/nir/spirv/nir_spirv.h b/src/glsl/nir/spirv/nir_spirv.h
new file mode 100644 (file)
index 0000000..9c9c93d
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+#pragma once
+
+#ifndef _NIR_SPIRV_H_
+#define _NIR_SPIRV_H_
+
+#include "nir.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nir_spirv_specialization {
+   uint32_t id;
+   uint32_t data;
+};
+
+nir_function *spirv_to_nir(const uint32_t *words, size_t word_count,
+                           struct nir_spirv_specialization *specializations,
+                           unsigned num_specializations,
+                           gl_shader_stage stage, const char *entry_point_name,
+                           const nir_shader_compiler_options *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NIR_SPIRV_H_ */
diff --git a/src/glsl/nir/spirv/spirv.h b/src/glsl/nir/spirv/spirv.h
new file mode 100644 (file)
index 0000000..63bcb2f
--- /dev/null
@@ -0,0 +1,870 @@
+/*
+** Copyright (c) 2014-2015 The Khronos Group Inc.
+** 
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and/or associated documentation files (the "Materials"),
+** to deal in the Materials without restriction, including without limitation
+** the rights to use, copy, modify, merge, publish, distribute, sublicense,
+** and/or sell copies of the Materials, and to permit persons to whom the
+** Materials are furnished to do so, subject to the following conditions:
+** 
+** The above copyright notice and this permission notice shall be included in
+** all copies or substantial portions of the Materials.
+** 
+** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS
+** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND
+** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ 
+** 
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS
+** IN THE MATERIALS.
+*/
+
+/*
+** This header is automatically generated by the same tool that creates
+** the Binary Section of the SPIR-V specification.
+*/
+
+/*
+** Enumeration tokens for SPIR-V, in various styles:
+**   C, C++, C++11, JSON, Lua, Python
+** 
+** - C will have tokens with a "Spv" prefix, e.g.: SpvSourceLanguageGLSL
+** - C++ will have tokens in the "spv" name space, e.g.: spv::SourceLanguageGLSL
+** - C++11 will use enum classes in the spv namespace, e.g.: spv::SourceLanguage::GLSL
+** - Lua will use tables, e.g.: spv.SourceLanguage.GLSL
+** - Python will use dictionaries, e.g.: spv['SourceLanguage']['GLSL']
+** 
+** Some tokens act like mask values, which can be OR'd together,
+** while others are mutually exclusive.  The mask-like ones have
+** "Mask" in their name, and a parallel enum that has the shift
+** amount (1 << x) for each corresponding enumerant.
+*/
+
+#ifndef spirv_H
+#define spirv_H
+
+typedef unsigned int SpvId;
+
+#define SPV_VERSION 0x10000
+#define SPV_REVISION 2
+
+static const unsigned int SpvMagicNumber = 0x07230203;
+static const unsigned int SpvVersion = 0x00010000;
+static const unsigned int SpvRevision = 2;
+static const unsigned int SpvOpCodeMask = 0xffff;
+static const unsigned int SpvWordCountShift = 16;
+
+typedef enum SpvSourceLanguage_ {
+    SpvSourceLanguageUnknown = 0,
+    SpvSourceLanguageESSL = 1,
+    SpvSourceLanguageGLSL = 2,
+    SpvSourceLanguageOpenCL_C = 3,
+    SpvSourceLanguageOpenCL_CPP = 4,
+} SpvSourceLanguage;
+
+typedef enum SpvExecutionModel_ {
+    SpvExecutionModelVertex = 0,
+    SpvExecutionModelTessellationControl = 1,
+    SpvExecutionModelTessellationEvaluation = 2,
+    SpvExecutionModelGeometry = 3,
+    SpvExecutionModelFragment = 4,
+    SpvExecutionModelGLCompute = 5,
+    SpvExecutionModelKernel = 6,
+} SpvExecutionModel;
+
+typedef enum SpvAddressingModel_ {
+    SpvAddressingModelLogical = 0,
+    SpvAddressingModelPhysical32 = 1,
+    SpvAddressingModelPhysical64 = 2,
+} SpvAddressingModel;
+
+typedef enum SpvMemoryModel_ {
+    SpvMemoryModelSimple = 0,
+    SpvMemoryModelGLSL450 = 1,
+    SpvMemoryModelOpenCL = 2,
+} SpvMemoryModel;
+
+typedef enum SpvExecutionMode_ {
+    SpvExecutionModeInvocations = 0,
+    SpvExecutionModeSpacingEqual = 1,
+    SpvExecutionModeSpacingFractionalEven = 2,
+    SpvExecutionModeSpacingFractionalOdd = 3,
+    SpvExecutionModeVertexOrderCw = 4,
+    SpvExecutionModeVertexOrderCcw = 5,
+    SpvExecutionModePixelCenterInteger = 6,
+    SpvExecutionModeOriginUpperLeft = 7,
+    SpvExecutionModeOriginLowerLeft = 8,
+    SpvExecutionModeEarlyFragmentTests = 9,
+    SpvExecutionModePointMode = 10,
+    SpvExecutionModeXfb = 11,
+    SpvExecutionModeDepthReplacing = 12,
+    SpvExecutionModeDepthGreater = 14,
+    SpvExecutionModeDepthLess = 15,
+    SpvExecutionModeDepthUnchanged = 16,
+    SpvExecutionModeLocalSize = 17,
+    SpvExecutionModeLocalSizeHint = 18,
+    SpvExecutionModeInputPoints = 19,
+    SpvExecutionModeInputLines = 20,
+    SpvExecutionModeInputLinesAdjacency = 21,
+    SpvExecutionModeTriangles = 22,
+    SpvExecutionModeInputTrianglesAdjacency = 23,
+    SpvExecutionModeQuads = 24,
+    SpvExecutionModeIsolines = 25,
+    SpvExecutionModeOutputVertices = 26,
+    SpvExecutionModeOutputPoints = 27,
+    SpvExecutionModeOutputLineStrip = 28,
+    SpvExecutionModeOutputTriangleStrip = 29,
+    SpvExecutionModeVecTypeHint = 30,
+    SpvExecutionModeContractionOff = 31,
+} SpvExecutionMode;
+
+typedef enum SpvStorageClass_ {
+    SpvStorageClassUniformConstant = 0,
+    SpvStorageClassInput = 1,
+    SpvStorageClassUniform = 2,
+    SpvStorageClassOutput = 3,
+    SpvStorageClassWorkgroup = 4,
+    SpvStorageClassCrossWorkgroup = 5,
+    SpvStorageClassPrivate = 6,
+    SpvStorageClassFunction = 7,
+    SpvStorageClassGeneric = 8,
+    SpvStorageClassPushConstant = 9,
+    SpvStorageClassAtomicCounter = 10,
+    SpvStorageClassImage = 11,
+} SpvStorageClass;
+
+typedef enum SpvDim_ {
+    SpvDim1D = 0,
+    SpvDim2D = 1,
+    SpvDim3D = 2,
+    SpvDimCube = 3,
+    SpvDimRect = 4,
+    SpvDimBuffer = 5,
+    SpvDimSubpassData = 6,
+} SpvDim;
+
+typedef enum SpvSamplerAddressingMode_ {
+    SpvSamplerAddressingModeNone = 0,
+    SpvSamplerAddressingModeClampToEdge = 1,
+    SpvSamplerAddressingModeClamp = 2,
+    SpvSamplerAddressingModeRepeat = 3,
+    SpvSamplerAddressingModeRepeatMirrored = 4,
+} SpvSamplerAddressingMode;
+
+typedef enum SpvSamplerFilterMode_ {
+    SpvSamplerFilterModeNearest = 0,
+    SpvSamplerFilterModeLinear = 1,
+} SpvSamplerFilterMode;
+
+typedef enum SpvImageFormat_ {
+    SpvImageFormatUnknown = 0,
+    SpvImageFormatRgba32f = 1,
+    SpvImageFormatRgba16f = 2,
+    SpvImageFormatR32f = 3,
+    SpvImageFormatRgba8 = 4,
+    SpvImageFormatRgba8Snorm = 5,
+    SpvImageFormatRg32f = 6,
+    SpvImageFormatRg16f = 7,
+    SpvImageFormatR11fG11fB10f = 8,
+    SpvImageFormatR16f = 9,
+    SpvImageFormatRgba16 = 10,
+    SpvImageFormatRgb10A2 = 11,
+    SpvImageFormatRg16 = 12,
+    SpvImageFormatRg8 = 13,
+    SpvImageFormatR16 = 14,
+    SpvImageFormatR8 = 15,
+    SpvImageFormatRgba16Snorm = 16,
+    SpvImageFormatRg16Snorm = 17,
+    SpvImageFormatRg8Snorm = 18,
+    SpvImageFormatR16Snorm = 19,
+    SpvImageFormatR8Snorm = 20,
+    SpvImageFormatRgba32i = 21,
+    SpvImageFormatRgba16i = 22,
+    SpvImageFormatRgba8i = 23,
+    SpvImageFormatR32i = 24,
+    SpvImageFormatRg32i = 25,
+    SpvImageFormatRg16i = 26,
+    SpvImageFormatRg8i = 27,
+    SpvImageFormatR16i = 28,
+    SpvImageFormatR8i = 29,
+    SpvImageFormatRgba32ui = 30,
+    SpvImageFormatRgba16ui = 31,
+    SpvImageFormatRgba8ui = 32,
+    SpvImageFormatR32ui = 33,
+    SpvImageFormatRgb10a2ui = 34,
+    SpvImageFormatRg32ui = 35,
+    SpvImageFormatRg16ui = 36,
+    SpvImageFormatRg8ui = 37,
+    SpvImageFormatR16ui = 38,
+    SpvImageFormatR8ui = 39,
+} SpvImageFormat;
+
+typedef enum SpvImageChannelOrder_ {
+    SpvImageChannelOrderR = 0,
+    SpvImageChannelOrderA = 1,
+    SpvImageChannelOrderRG = 2,
+    SpvImageChannelOrderRA = 3,
+    SpvImageChannelOrderRGB = 4,
+    SpvImageChannelOrderRGBA = 5,
+    SpvImageChannelOrderBGRA = 6,
+    SpvImageChannelOrderARGB = 7,
+    SpvImageChannelOrderIntensity = 8,
+    SpvImageChannelOrderLuminance = 9,
+    SpvImageChannelOrderRx = 10,
+    SpvImageChannelOrderRGx = 11,
+    SpvImageChannelOrderRGBx = 12,
+    SpvImageChannelOrderDepth = 13,
+    SpvImageChannelOrderDepthStencil = 14,
+    SpvImageChannelOrdersRGB = 15,
+    SpvImageChannelOrdersRGBx = 16,
+    SpvImageChannelOrdersRGBA = 17,
+    SpvImageChannelOrdersBGRA = 18,
+} SpvImageChannelOrder;
+
+typedef enum SpvImageChannelDataType_ {
+    SpvImageChannelDataTypeSnormInt8 = 0,
+    SpvImageChannelDataTypeSnormInt16 = 1,
+    SpvImageChannelDataTypeUnormInt8 = 2,
+    SpvImageChannelDataTypeUnormInt16 = 3,
+    SpvImageChannelDataTypeUnormShort565 = 4,
+    SpvImageChannelDataTypeUnormShort555 = 5,
+    SpvImageChannelDataTypeUnormInt101010 = 6,
+    SpvImageChannelDataTypeSignedInt8 = 7,
+    SpvImageChannelDataTypeSignedInt16 = 8,
+    SpvImageChannelDataTypeSignedInt32 = 9,
+    SpvImageChannelDataTypeUnsignedInt8 = 10,
+    SpvImageChannelDataTypeUnsignedInt16 = 11,
+    SpvImageChannelDataTypeUnsignedInt32 = 12,
+    SpvImageChannelDataTypeHalfFloat = 13,
+    SpvImageChannelDataTypeFloat = 14,
+    SpvImageChannelDataTypeUnormInt24 = 15,
+    SpvImageChannelDataTypeUnormInt101010_2 = 16,
+} SpvImageChannelDataType;
+
+typedef enum SpvImageOperandsShift_ {
+    SpvImageOperandsBiasShift = 0,
+    SpvImageOperandsLodShift = 1,
+    SpvImageOperandsGradShift = 2,
+    SpvImageOperandsConstOffsetShift = 3,
+    SpvImageOperandsOffsetShift = 4,
+    SpvImageOperandsConstOffsetsShift = 5,
+    SpvImageOperandsSampleShift = 6,
+    SpvImageOperandsMinLodShift = 7,
+} SpvImageOperandsShift;
+
+typedef enum SpvImageOperandsMask_ {
+    SpvImageOperandsMaskNone = 0,
+    SpvImageOperandsBiasMask = 0x00000001,
+    SpvImageOperandsLodMask = 0x00000002,
+    SpvImageOperandsGradMask = 0x00000004,
+    SpvImageOperandsConstOffsetMask = 0x00000008,
+    SpvImageOperandsOffsetMask = 0x00000010,
+    SpvImageOperandsConstOffsetsMask = 0x00000020,
+    SpvImageOperandsSampleMask = 0x00000040,
+    SpvImageOperandsMinLodMask = 0x00000080,
+} SpvImageOperandsMask;
+
+typedef enum SpvFPFastMathModeShift_ {
+    SpvFPFastMathModeNotNaNShift = 0,
+    SpvFPFastMathModeNotInfShift = 1,
+    SpvFPFastMathModeNSZShift = 2,
+    SpvFPFastMathModeAllowRecipShift = 3,
+    SpvFPFastMathModeFastShift = 4,
+} SpvFPFastMathModeShift;
+
+typedef enum SpvFPFastMathModeMask_ {
+    SpvFPFastMathModeMaskNone = 0,
+    SpvFPFastMathModeNotNaNMask = 0x00000001,
+    SpvFPFastMathModeNotInfMask = 0x00000002,
+    SpvFPFastMathModeNSZMask = 0x00000004,
+    SpvFPFastMathModeAllowRecipMask = 0x00000008,
+    SpvFPFastMathModeFastMask = 0x00000010,
+} SpvFPFastMathModeMask;
+
+typedef enum SpvFPRoundingMode_ {
+    SpvFPRoundingModeRTE = 0,
+    SpvFPRoundingModeRTZ = 1,
+    SpvFPRoundingModeRTP = 2,
+    SpvFPRoundingModeRTN = 3,
+} SpvFPRoundingMode;
+
+typedef enum SpvLinkageType_ {
+    SpvLinkageTypeExport = 0,
+    SpvLinkageTypeImport = 1,
+} SpvLinkageType;
+
+typedef enum SpvAccessQualifier_ {
+    SpvAccessQualifierReadOnly = 0,
+    SpvAccessQualifierWriteOnly = 1,
+    SpvAccessQualifierReadWrite = 2,
+} SpvAccessQualifier;
+
+typedef enum SpvFunctionParameterAttribute_ {
+    SpvFunctionParameterAttributeZext = 0,
+    SpvFunctionParameterAttributeSext = 1,
+    SpvFunctionParameterAttributeByVal = 2,
+    SpvFunctionParameterAttributeSret = 3,
+    SpvFunctionParameterAttributeNoAlias = 4,
+    SpvFunctionParameterAttributeNoCapture = 5,
+    SpvFunctionParameterAttributeNoWrite = 6,
+    SpvFunctionParameterAttributeNoReadWrite = 7,
+} SpvFunctionParameterAttribute;
+
+typedef enum SpvDecoration_ {
+    SpvDecorationRelaxedPrecision = 0,
+    SpvDecorationSpecId = 1,
+    SpvDecorationBlock = 2,
+    SpvDecorationBufferBlock = 3,
+    SpvDecorationRowMajor = 4,
+    SpvDecorationColMajor = 5,
+    SpvDecorationArrayStride = 6,
+    SpvDecorationMatrixStride = 7,
+    SpvDecorationGLSLShared = 8,
+    SpvDecorationGLSLPacked = 9,
+    SpvDecorationCPacked = 10,
+    SpvDecorationBuiltIn = 11,
+    SpvDecorationNoPerspective = 13,
+    SpvDecorationFlat = 14,
+    SpvDecorationPatch = 15,
+    SpvDecorationCentroid = 16,
+    SpvDecorationSample = 17,
+    SpvDecorationInvariant = 18,
+    SpvDecorationRestrict = 19,
+    SpvDecorationAliased = 20,
+    SpvDecorationVolatile = 21,
+    SpvDecorationConstant = 22,
+    SpvDecorationCoherent = 23,
+    SpvDecorationNonWritable = 24,
+    SpvDecorationNonReadable = 25,
+    SpvDecorationUniform = 26,
+    SpvDecorationSaturatedConversion = 28,
+    SpvDecorationStream = 29,
+    SpvDecorationLocation = 30,
+    SpvDecorationComponent = 31,
+    SpvDecorationIndex = 32,
+    SpvDecorationBinding = 33,
+    SpvDecorationDescriptorSet = 34,
+    SpvDecorationOffset = 35,
+    SpvDecorationXfbBuffer = 36,
+    SpvDecorationXfbStride = 37,
+    SpvDecorationFuncParamAttr = 38,
+    SpvDecorationFPRoundingMode = 39,
+    SpvDecorationFPFastMathMode = 40,
+    SpvDecorationLinkageAttributes = 41,
+    SpvDecorationNoContraction = 42,
+    SpvDecorationInputAttachmentIndex = 43,
+    SpvDecorationAlignment = 44,
+} SpvDecoration;
+
+typedef enum SpvBuiltIn_ {
+    SpvBuiltInPosition = 0,
+    SpvBuiltInPointSize = 1,
+    SpvBuiltInClipDistance = 3,
+    SpvBuiltInCullDistance = 4,
+    SpvBuiltInVertexId = 5,
+    SpvBuiltInInstanceId = 6,
+    SpvBuiltInPrimitiveId = 7,
+    SpvBuiltInInvocationId = 8,
+    SpvBuiltInLayer = 9,
+    SpvBuiltInViewportIndex = 10,
+    SpvBuiltInTessLevelOuter = 11,
+    SpvBuiltInTessLevelInner = 12,
+    SpvBuiltInTessCoord = 13,
+    SpvBuiltInPatchVertices = 14,
+    SpvBuiltInFragCoord = 15,
+    SpvBuiltInPointCoord = 16,
+    SpvBuiltInFrontFacing = 17,
+    SpvBuiltInSampleId = 18,
+    SpvBuiltInSamplePosition = 19,
+    SpvBuiltInSampleMask = 20,
+    SpvBuiltInFragDepth = 22,
+    SpvBuiltInHelperInvocation = 23,
+    SpvBuiltInNumWorkgroups = 24,
+    SpvBuiltInWorkgroupSize = 25,
+    SpvBuiltInWorkgroupId = 26,
+    SpvBuiltInLocalInvocationId = 27,
+    SpvBuiltInGlobalInvocationId = 28,
+    SpvBuiltInLocalInvocationIndex = 29,
+    SpvBuiltInWorkDim = 30,
+    SpvBuiltInGlobalSize = 31,
+    SpvBuiltInEnqueuedWorkgroupSize = 32,
+    SpvBuiltInGlobalOffset = 33,
+    SpvBuiltInGlobalLinearId = 34,
+    SpvBuiltInSubgroupSize = 36,
+    SpvBuiltInSubgroupMaxSize = 37,
+    SpvBuiltInNumSubgroups = 38,
+    SpvBuiltInNumEnqueuedSubgroups = 39,
+    SpvBuiltInSubgroupId = 40,
+    SpvBuiltInSubgroupLocalInvocationId = 41,
+    SpvBuiltInVertexIndex = 42,
+    SpvBuiltInInstanceIndex = 43,
+} SpvBuiltIn;
+
+typedef enum SpvSelectionControlShift_ {
+    SpvSelectionControlFlattenShift = 0,
+    SpvSelectionControlDontFlattenShift = 1,
+} SpvSelectionControlShift;
+
+typedef enum SpvSelectionControlMask_ {
+    SpvSelectionControlMaskNone = 0,
+    SpvSelectionControlFlattenMask = 0x00000001,
+    SpvSelectionControlDontFlattenMask = 0x00000002,
+} SpvSelectionControlMask;
+
+typedef enum SpvLoopControlShift_ {
+    SpvLoopControlUnrollShift = 0,
+    SpvLoopControlDontUnrollShift = 1,
+} SpvLoopControlShift;
+
+typedef enum SpvLoopControlMask_ {
+    SpvLoopControlMaskNone = 0,
+    SpvLoopControlUnrollMask = 0x00000001,
+    SpvLoopControlDontUnrollMask = 0x00000002,
+} SpvLoopControlMask;
+
+typedef enum SpvFunctionControlShift_ {
+    SpvFunctionControlInlineShift = 0,
+    SpvFunctionControlDontInlineShift = 1,
+    SpvFunctionControlPureShift = 2,
+    SpvFunctionControlConstShift = 3,
+} SpvFunctionControlShift;
+
+typedef enum SpvFunctionControlMask_ {
+    SpvFunctionControlMaskNone = 0,
+    SpvFunctionControlInlineMask = 0x00000001,
+    SpvFunctionControlDontInlineMask = 0x00000002,
+    SpvFunctionControlPureMask = 0x00000004,
+    SpvFunctionControlConstMask = 0x00000008,
+} SpvFunctionControlMask;
+
+typedef enum SpvMemorySemanticsShift_ {
+    SpvMemorySemanticsAcquireShift = 1,
+    SpvMemorySemanticsReleaseShift = 2,
+    SpvMemorySemanticsAcquireReleaseShift = 3,
+    SpvMemorySemanticsSequentiallyConsistentShift = 4,
+    SpvMemorySemanticsUniformMemoryShift = 6,
+    SpvMemorySemanticsSubgroupMemoryShift = 7,
+    SpvMemorySemanticsWorkgroupMemoryShift = 8,
+    SpvMemorySemanticsCrossWorkgroupMemoryShift = 9,
+    SpvMemorySemanticsAtomicCounterMemoryShift = 10,
+    SpvMemorySemanticsImageMemoryShift = 11,
+} SpvMemorySemanticsShift;
+
+typedef enum SpvMemorySemanticsMask_ {
+    SpvMemorySemanticsMaskNone = 0,
+    SpvMemorySemanticsAcquireMask = 0x00000002,
+    SpvMemorySemanticsReleaseMask = 0x00000004,
+    SpvMemorySemanticsAcquireReleaseMask = 0x00000008,
+    SpvMemorySemanticsSequentiallyConsistentMask = 0x00000010,
+    SpvMemorySemanticsUniformMemoryMask = 0x00000040,
+    SpvMemorySemanticsSubgroupMemoryMask = 0x00000080,
+    SpvMemorySemanticsWorkgroupMemoryMask = 0x00000100,
+    SpvMemorySemanticsCrossWorkgroupMemoryMask = 0x00000200,
+    SpvMemorySemanticsAtomicCounterMemoryMask = 0x00000400,
+    SpvMemorySemanticsImageMemoryMask = 0x00000800,
+} SpvMemorySemanticsMask;
+
+typedef enum SpvMemoryAccessShift_ {
+    SpvMemoryAccessVolatileShift = 0,
+    SpvMemoryAccessAlignedShift = 1,
+    SpvMemoryAccessNontemporalShift = 2,
+} SpvMemoryAccessShift;
+
+typedef enum SpvMemoryAccessMask_ {
+    SpvMemoryAccessMaskNone = 0,
+    SpvMemoryAccessVolatileMask = 0x00000001,
+    SpvMemoryAccessAlignedMask = 0x00000002,
+    SpvMemoryAccessNontemporalMask = 0x00000004,
+} SpvMemoryAccessMask;
+
+typedef enum SpvScope_ {
+    SpvScopeCrossDevice = 0,
+    SpvScopeDevice = 1,
+    SpvScopeWorkgroup = 2,
+    SpvScopeSubgroup = 3,
+    SpvScopeInvocation = 4,
+} SpvScope;
+
+typedef enum SpvGroupOperation_ {
+    SpvGroupOperationReduce = 0,
+    SpvGroupOperationInclusiveScan = 1,
+    SpvGroupOperationExclusiveScan = 2,
+} SpvGroupOperation;
+
+typedef enum SpvKernelEnqueueFlags_ {
+    SpvKernelEnqueueFlagsNoWait = 0,
+    SpvKernelEnqueueFlagsWaitKernel = 1,
+    SpvKernelEnqueueFlagsWaitWorkGroup = 2,
+} SpvKernelEnqueueFlags;
+
+typedef enum SpvKernelProfilingInfoShift_ {
+    SpvKernelProfilingInfoCmdExecTimeShift = 0,
+} SpvKernelProfilingInfoShift;
+
+typedef enum SpvKernelProfilingInfoMask_ {
+    SpvKernelProfilingInfoMaskNone = 0,
+    SpvKernelProfilingInfoCmdExecTimeMask = 0x00000001,
+} SpvKernelProfilingInfoMask;
+
+typedef enum SpvCapability_ {
+    SpvCapabilityMatrix = 0,
+    SpvCapabilityShader = 1,
+    SpvCapabilityGeometry = 2,
+    SpvCapabilityTessellation = 3,
+    SpvCapabilityAddresses = 4,
+    SpvCapabilityLinkage = 5,
+    SpvCapabilityKernel = 6,
+    SpvCapabilityVector16 = 7,
+    SpvCapabilityFloat16Buffer = 8,
+    SpvCapabilityFloat16 = 9,
+    SpvCapabilityFloat64 = 10,
+    SpvCapabilityInt64 = 11,
+    SpvCapabilityInt64Atomics = 12,
+    SpvCapabilityImageBasic = 13,
+    SpvCapabilityImageReadWrite = 14,
+    SpvCapabilityImageMipmap = 15,
+    SpvCapabilityPipes = 17,
+    SpvCapabilityGroups = 18,
+    SpvCapabilityDeviceEnqueue = 19,
+    SpvCapabilityLiteralSampler = 20,
+    SpvCapabilityAtomicStorage = 21,
+    SpvCapabilityInt16 = 22,
+    SpvCapabilityTessellationPointSize = 23,
+    SpvCapabilityGeometryPointSize = 24,
+    SpvCapabilityImageGatherExtended = 25,
+    SpvCapabilityStorageImageMultisample = 27,
+    SpvCapabilityUniformBufferArrayDynamicIndexing = 28,
+    SpvCapabilitySampledImageArrayDynamicIndexing = 29,
+    SpvCapabilityStorageBufferArrayDynamicIndexing = 30,
+    SpvCapabilityStorageImageArrayDynamicIndexing = 31,
+    SpvCapabilityClipDistance = 32,
+    SpvCapabilityCullDistance = 33,
+    SpvCapabilityImageCubeArray = 34,
+    SpvCapabilitySampleRateShading = 35,
+    SpvCapabilityImageRect = 36,
+    SpvCapabilitySampledRect = 37,
+    SpvCapabilityGenericPointer = 38,
+    SpvCapabilityInt8 = 39,
+    SpvCapabilityInputAttachment = 40,
+    SpvCapabilitySparseResidency = 41,
+    SpvCapabilityMinLod = 42,
+    SpvCapabilitySampled1D = 43,
+    SpvCapabilityImage1D = 44,
+    SpvCapabilitySampledCubeArray = 45,
+    SpvCapabilitySampledBuffer = 46,
+    SpvCapabilityImageBuffer = 47,
+    SpvCapabilityImageMSArray = 48,
+    SpvCapabilityStorageImageExtendedFormats = 49,
+    SpvCapabilityImageQuery = 50,
+    SpvCapabilityDerivativeControl = 51,
+    SpvCapabilityInterpolationFunction = 52,
+    SpvCapabilityTransformFeedback = 53,
+    SpvCapabilityGeometryStreams = 54,
+    SpvCapabilityStorageImageReadWithoutFormat = 55,
+    SpvCapabilityStorageImageWriteWithoutFormat = 56,
+    SpvCapabilityMultiViewport = 57,
+} SpvCapability;
+
+typedef enum SpvOp_ {
+    SpvOpNop = 0,
+    SpvOpUndef = 1,
+    SpvOpSourceContinued = 2,
+    SpvOpSource = 3,
+    SpvOpSourceExtension = 4,
+    SpvOpName = 5,
+    SpvOpMemberName = 6,
+    SpvOpString = 7,
+    SpvOpLine = 8,
+    SpvOpExtension = 10,
+    SpvOpExtInstImport = 11,
+    SpvOpExtInst = 12,
+    SpvOpMemoryModel = 14,
+    SpvOpEntryPoint = 15,
+    SpvOpExecutionMode = 16,
+    SpvOpCapability = 17,
+    SpvOpTypeVoid = 19,
+    SpvOpTypeBool = 20,
+    SpvOpTypeInt = 21,
+    SpvOpTypeFloat = 22,
+    SpvOpTypeVector = 23,
+    SpvOpTypeMatrix = 24,
+    SpvOpTypeImage = 25,
+    SpvOpTypeSampler = 26,
+    SpvOpTypeSampledImage = 27,
+    SpvOpTypeArray = 28,
+    SpvOpTypeRuntimeArray = 29,
+    SpvOpTypeStruct = 30,
+    SpvOpTypeOpaque = 31,
+    SpvOpTypePointer = 32,
+    SpvOpTypeFunction = 33,
+    SpvOpTypeEvent = 34,
+    SpvOpTypeDeviceEvent = 35,
+    SpvOpTypeReserveId = 36,
+    SpvOpTypeQueue = 37,
+    SpvOpTypePipe = 38,
+    SpvOpTypeForwardPointer = 39,
+    SpvOpConstantTrue = 41,
+    SpvOpConstantFalse = 42,
+    SpvOpConstant = 43,
+    SpvOpConstantComposite = 44,
+    SpvOpConstantSampler = 45,
+    SpvOpConstantNull = 46,
+    SpvOpSpecConstantTrue = 48,
+    SpvOpSpecConstantFalse = 49,
+    SpvOpSpecConstant = 50,
+    SpvOpSpecConstantComposite = 51,
+    SpvOpSpecConstantOp = 52,
+    SpvOpFunction = 54,
+    SpvOpFunctionParameter = 55,
+    SpvOpFunctionEnd = 56,
+    SpvOpFunctionCall = 57,
+    SpvOpVariable = 59,
+    SpvOpImageTexelPointer = 60,
+    SpvOpLoad = 61,
+    SpvOpStore = 62,
+    SpvOpCopyMemory = 63,
+    SpvOpCopyMemorySized = 64,
+    SpvOpAccessChain = 65,
+    SpvOpInBoundsAccessChain = 66,
+    SpvOpPtrAccessChain = 67,
+    SpvOpArrayLength = 68,
+    SpvOpGenericPtrMemSemantics = 69,
+    SpvOpInBoundsPtrAccessChain = 70,
+    SpvOpDecorate = 71,
+    SpvOpMemberDecorate = 72,
+    SpvOpDecorationGroup = 73,
+    SpvOpGroupDecorate = 74,
+    SpvOpGroupMemberDecorate = 75,
+    SpvOpVectorExtractDynamic = 77,
+    SpvOpVectorInsertDynamic = 78,
+    SpvOpVectorShuffle = 79,
+    SpvOpCompositeConstruct = 80,
+    SpvOpCompositeExtract = 81,
+    SpvOpCompositeInsert = 82,
+    SpvOpCopyObject = 83,
+    SpvOpTranspose = 84,
+    SpvOpSampledImage = 86,
+    SpvOpImageSampleImplicitLod = 87,
+    SpvOpImageSampleExplicitLod = 88,
+    SpvOpImageSampleDrefImplicitLod = 89,
+    SpvOpImageSampleDrefExplicitLod = 90,
+    SpvOpImageSampleProjImplicitLod = 91,
+    SpvOpImageSampleProjExplicitLod = 92,
+    SpvOpImageSampleProjDrefImplicitLod = 93,
+    SpvOpImageSampleProjDrefExplicitLod = 94,
+    SpvOpImageFetch = 95,
+    SpvOpImageGather = 96,
+    SpvOpImageDrefGather = 97,
+    SpvOpImageRead = 98,
+    SpvOpImageWrite = 99,
+    SpvOpImage = 100,
+    SpvOpImageQueryFormat = 101,
+    SpvOpImageQueryOrder = 102,
+    SpvOpImageQuerySizeLod = 103,
+    SpvOpImageQuerySize = 104,
+    SpvOpImageQueryLod = 105,
+    SpvOpImageQueryLevels = 106,
+    SpvOpImageQuerySamples = 107,
+    SpvOpConvertFToU = 109,
+    SpvOpConvertFToS = 110,
+    SpvOpConvertSToF = 111,
+    SpvOpConvertUToF = 112,
+    SpvOpUConvert = 113,
+    SpvOpSConvert = 114,
+    SpvOpFConvert = 115,
+    SpvOpQuantizeToF16 = 116,
+    SpvOpConvertPtrToU = 117,
+    SpvOpSatConvertSToU = 118,
+    SpvOpSatConvertUToS = 119,
+    SpvOpConvertUToPtr = 120,
+    SpvOpPtrCastToGeneric = 121,
+    SpvOpGenericCastToPtr = 122,
+    SpvOpGenericCastToPtrExplicit = 123,
+    SpvOpBitcast = 124,
+    SpvOpSNegate = 126,
+    SpvOpFNegate = 127,
+    SpvOpIAdd = 128,
+    SpvOpFAdd = 129,
+    SpvOpISub = 130,
+    SpvOpFSub = 131,
+    SpvOpIMul = 132,
+    SpvOpFMul = 133,
+    SpvOpUDiv = 134,
+    SpvOpSDiv = 135,
+    SpvOpFDiv = 136,
+    SpvOpUMod = 137,
+    SpvOpSRem = 138,
+    SpvOpSMod = 139,
+    SpvOpFRem = 140,
+    SpvOpFMod = 141,
+    SpvOpVectorTimesScalar = 142,
+    SpvOpMatrixTimesScalar = 143,
+    SpvOpVectorTimesMatrix = 144,
+    SpvOpMatrixTimesVector = 145,
+    SpvOpMatrixTimesMatrix = 146,
+    SpvOpOuterProduct = 147,
+    SpvOpDot = 148,
+    SpvOpIAddCarry = 149,
+    SpvOpISubBorrow = 150,
+    SpvOpUMulExtended = 151,
+    SpvOpSMulExtended = 152,
+    SpvOpAny = 154,
+    SpvOpAll = 155,
+    SpvOpIsNan = 156,
+    SpvOpIsInf = 157,
+    SpvOpIsFinite = 158,
+    SpvOpIsNormal = 159,
+    SpvOpSignBitSet = 160,
+    SpvOpLessOrGreater = 161,
+    SpvOpOrdered = 162,
+    SpvOpUnordered = 163,
+    SpvOpLogicalEqual = 164,
+    SpvOpLogicalNotEqual = 165,
+    SpvOpLogicalOr = 166,
+    SpvOpLogicalAnd = 167,
+    SpvOpLogicalNot = 168,
+    SpvOpSelect = 169,
+    SpvOpIEqual = 170,
+    SpvOpINotEqual = 171,
+    SpvOpUGreaterThan = 172,
+    SpvOpSGreaterThan = 173,
+    SpvOpUGreaterThanEqual = 174,
+    SpvOpSGreaterThanEqual = 175,
+    SpvOpULessThan = 176,
+    SpvOpSLessThan = 177,
+    SpvOpULessThanEqual = 178,
+    SpvOpSLessThanEqual = 179,
+    SpvOpFOrdEqual = 180,
+    SpvOpFUnordEqual = 181,
+    SpvOpFOrdNotEqual = 182,
+    SpvOpFUnordNotEqual = 183,
+    SpvOpFOrdLessThan = 184,
+    SpvOpFUnordLessThan = 185,
+    SpvOpFOrdGreaterThan = 186,
+    SpvOpFUnordGreaterThan = 187,
+    SpvOpFOrdLessThanEqual = 188,
+    SpvOpFUnordLessThanEqual = 189,
+    SpvOpFOrdGreaterThanEqual = 190,
+    SpvOpFUnordGreaterThanEqual = 191,
+    SpvOpShiftRightLogical = 194,
+    SpvOpShiftRightArithmetic = 195,
+    SpvOpShiftLeftLogical = 196,
+    SpvOpBitwiseOr = 197,
+    SpvOpBitwiseXor = 198,
+    SpvOpBitwiseAnd = 199,
+    SpvOpNot = 200,
+    SpvOpBitFieldInsert = 201,
+    SpvOpBitFieldSExtract = 202,
+    SpvOpBitFieldUExtract = 203,
+    SpvOpBitReverse = 204,
+    SpvOpBitCount = 205,
+    SpvOpDPdx = 207,
+    SpvOpDPdy = 208,
+    SpvOpFwidth = 209,
+    SpvOpDPdxFine = 210,
+    SpvOpDPdyFine = 211,
+    SpvOpFwidthFine = 212,
+    SpvOpDPdxCoarse = 213,
+    SpvOpDPdyCoarse = 214,
+    SpvOpFwidthCoarse = 215,
+    SpvOpEmitVertex = 218,
+    SpvOpEndPrimitive = 219,
+    SpvOpEmitStreamVertex = 220,
+    SpvOpEndStreamPrimitive = 221,
+    SpvOpControlBarrier = 224,
+    SpvOpMemoryBarrier = 225,
+    SpvOpAtomicLoad = 227,
+    SpvOpAtomicStore = 228,
+    SpvOpAtomicExchange = 229,
+    SpvOpAtomicCompareExchange = 230,
+    SpvOpAtomicCompareExchangeWeak = 231,
+    SpvOpAtomicIIncrement = 232,
+    SpvOpAtomicIDecrement = 233,
+    SpvOpAtomicIAdd = 234,
+    SpvOpAtomicISub = 235,
+    SpvOpAtomicSMin = 236,
+    SpvOpAtomicUMin = 237,
+    SpvOpAtomicSMax = 238,
+    SpvOpAtomicUMax = 239,
+    SpvOpAtomicAnd = 240,
+    SpvOpAtomicOr = 241,
+    SpvOpAtomicXor = 242,
+    SpvOpPhi = 245,
+    SpvOpLoopMerge = 246,
+    SpvOpSelectionMerge = 247,
+    SpvOpLabel = 248,
+    SpvOpBranch = 249,
+    SpvOpBranchConditional = 250,
+    SpvOpSwitch = 251,
+    SpvOpKill = 252,
+    SpvOpReturn = 253,
+    SpvOpReturnValue = 254,
+    SpvOpUnreachable = 255,
+    SpvOpLifetimeStart = 256,
+    SpvOpLifetimeStop = 257,
+    SpvOpGroupAsyncCopy = 259,
+    SpvOpGroupWaitEvents = 260,
+    SpvOpGroupAll = 261,
+    SpvOpGroupAny = 262,
+    SpvOpGroupBroadcast = 263,
+    SpvOpGroupIAdd = 264,
+    SpvOpGroupFAdd = 265,
+    SpvOpGroupFMin = 266,
+    SpvOpGroupUMin = 267,
+    SpvOpGroupSMin = 268,
+    SpvOpGroupFMax = 269,
+    SpvOpGroupUMax = 270,
+    SpvOpGroupSMax = 271,
+    SpvOpReadPipe = 274,
+    SpvOpWritePipe = 275,
+    SpvOpReservedReadPipe = 276,
+    SpvOpReservedWritePipe = 277,
+    SpvOpReserveReadPipePackets = 278,
+    SpvOpReserveWritePipePackets = 279,
+    SpvOpCommitReadPipe = 280,
+    SpvOpCommitWritePipe = 281,
+    SpvOpIsValidReserveId = 282,
+    SpvOpGetNumPipePackets = 283,
+    SpvOpGetMaxPipePackets = 284,
+    SpvOpGroupReserveReadPipePackets = 285,
+    SpvOpGroupReserveWritePipePackets = 286,
+    SpvOpGroupCommitReadPipe = 287,
+    SpvOpGroupCommitWritePipe = 288,
+    SpvOpEnqueueMarker = 291,
+    SpvOpEnqueueKernel = 292,
+    SpvOpGetKernelNDrangeSubGroupCount = 293,
+    SpvOpGetKernelNDrangeMaxSubGroupSize = 294,
+    SpvOpGetKernelWorkGroupSize = 295,
+    SpvOpGetKernelPreferredWorkGroupSizeMultiple = 296,
+    SpvOpRetainEvent = 297,
+    SpvOpReleaseEvent = 298,
+    SpvOpCreateUserEvent = 299,
+    SpvOpIsValidEvent = 300,
+    SpvOpSetUserEventStatus = 301,
+    SpvOpCaptureEventProfilingInfo = 302,
+    SpvOpGetDefaultQueue = 303,
+    SpvOpBuildNDRange = 304,
+    SpvOpImageSparseSampleImplicitLod = 305,
+    SpvOpImageSparseSampleExplicitLod = 306,
+    SpvOpImageSparseSampleDrefImplicitLod = 307,
+    SpvOpImageSparseSampleDrefExplicitLod = 308,
+    SpvOpImageSparseSampleProjImplicitLod = 309,
+    SpvOpImageSparseSampleProjExplicitLod = 310,
+    SpvOpImageSparseSampleProjDrefImplicitLod = 311,
+    SpvOpImageSparseSampleProjDrefExplicitLod = 312,
+    SpvOpImageSparseFetch = 313,
+    SpvOpImageSparseGather = 314,
+    SpvOpImageSparseDrefGather = 315,
+    SpvOpImageSparseTexelsResident = 316,
+    SpvOpNoLine = 317,
+    SpvOpAtomicFlagTestAndSet = 318,
+    SpvOpAtomicFlagClear = 319,
+} SpvOp;
+
+#endif  // #ifndef spirv_H
+
diff --git a/src/glsl/nir/spirv/spirv_to_nir.c b/src/glsl/nir/spirv/spirv_to_nir.c
new file mode 100644 (file)
index 0000000..d021122
--- /dev/null
@@ -0,0 +1,3861 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+#include "vtn_private.h"
+#include "nir/nir_vla.h"
+#include "nir/nir_control_flow.h"
+#include "nir/nir_constant_expressions.h"
+
+/* Builds a vtn_ssa_value of the given GLSL type whose scalar/vector leaves
+ * are fresh nir_ssa_undef instructions inserted at the top of the current
+ * nir_function_impl.  Matrices, arrays, and structs recurse per element,
+ * allocating an elems[] array of child values.
+ */
+static struct vtn_ssa_value *
+vtn_undef_ssa_value(struct vtn_builder *b, const struct glsl_type *type)
+{
+   struct vtn_ssa_value *val = rzalloc(b, struct vtn_ssa_value);
+   val->type = type;
+
+   if (glsl_type_is_vector_or_scalar(type)) {
+      unsigned num_components = glsl_get_vector_elements(val->type);
+      nir_ssa_undef_instr *undef =
+         nir_ssa_undef_instr_create(b->shader, num_components);
+
+      nir_instr_insert_before_cf_list(&b->impl->body, &undef->instr);
+      val->def = &undef->def;
+   } else {
+      unsigned elems = glsl_get_length(val->type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+      if (glsl_type_is_matrix(type)) {
+         /* A matrix is represented as an array of column vectors. */
+         const struct glsl_type *elem_type =
+            glsl_vector_type(glsl_get_base_type(type),
+                             glsl_get_vector_elements(type));
+
+         for (unsigned i = 0; i < elems; i++)
+            val->elems[i] = vtn_undef_ssa_value(b, elem_type);
+      } else if (glsl_type_is_array(type)) {
+         const struct glsl_type *elem_type = glsl_get_array_element(type);
+         for (unsigned i = 0; i < elems; i++)
+            val->elems[i] = vtn_undef_ssa_value(b, elem_type);
+      } else {
+         /* Struct: each field may have a different type. */
+         for (unsigned i = 0; i < elems; i++) {
+            const struct glsl_type *elem_type = glsl_get_struct_field(type, i);
+            val->elems[i] = vtn_undef_ssa_value(b, elem_type);
+         }
+      }
+   }
+
+   return val;
+}
+
+/* Converts a nir_constant of the given type into a vtn_ssa_value whose
+ * leaves are nir_load_const instructions inserted at the top of the current
+ * impl.  Results are memoized in b->const_table keyed on the nir_constant,
+ * so repeated uses of the same constant share one tree.
+ *
+ * NOTE(review): the newly built value does not appear to be added to
+ * b->const_table here — presumably the caller inserts it; confirm against
+ * the rest of the file.
+ */
+static struct vtn_ssa_value *
+vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
+                    const struct glsl_type *type)
+{
+   struct hash_entry *entry = _mesa_hash_table_search(b->const_table, constant);
+
+   if (entry)
+      return entry->data;
+
+   struct vtn_ssa_value *val = rzalloc(b, struct vtn_ssa_value);
+   val->type = type;
+
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+      if (glsl_type_is_vector_or_scalar(type)) {
+         unsigned num_components = glsl_get_vector_elements(val->type);
+         nir_load_const_instr *load =
+            nir_load_const_instr_create(b->shader, num_components);
+
+         for (unsigned i = 0; i < num_components; i++)
+            load->value.u[i] = constant->value.u[i];
+
+         nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
+         val->def = &load->def;
+      } else {
+         /* Matrix constants are stored column-major in value.u[]; emit one
+          * load_const per column and collect them in elems[].
+          */
+         assert(glsl_type_is_matrix(type));
+         unsigned rows = glsl_get_vector_elements(val->type);
+         unsigned columns = glsl_get_matrix_columns(val->type);
+         val->elems = ralloc_array(b, struct vtn_ssa_value *, columns);
+
+         for (unsigned i = 0; i < columns; i++) {
+            struct vtn_ssa_value *col_val = rzalloc(b, struct vtn_ssa_value);
+            col_val->type = glsl_get_column_type(val->type);
+            nir_load_const_instr *load =
+               nir_load_const_instr_create(b->shader, rows);
+
+            for (unsigned j = 0; j < rows; j++)
+               load->value.u[j] = constant->value.u[rows * i + j];
+
+            nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
+            col_val->def = &load->def;
+
+            val->elems[i] = col_val;
+         }
+      }
+      break;
+
+   case GLSL_TYPE_ARRAY: {
+      unsigned elems = glsl_get_length(val->type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+      const struct glsl_type *elem_type = glsl_get_array_element(val->type);
+      for (unsigned i = 0; i < elems; i++)
+         val->elems[i] = vtn_const_ssa_value(b, constant->elements[i],
+                                             elem_type);
+      break;
+   }
+
+   case GLSL_TYPE_STRUCT: {
+      unsigned elems = glsl_get_length(val->type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+      for (unsigned i = 0; i < elems; i++) {
+         const struct glsl_type *elem_type =
+            glsl_get_struct_field(val->type, i);
+         val->elems[i] = vtn_const_ssa_value(b, constant->elements[i],
+                                             elem_type);
+      }
+      break;
+   }
+
+   default:
+      unreachable("bad constant type");
+   }
+
+   return val;
+}
+
+/* Returns the vtn_ssa_value for a SPIR-V result <id>, materializing it on
+ * demand: undefs and constants get NIR instructions built here, plain SSA
+ * values are returned as-is, and derefs are loaded (needed when function
+ * parameters are passed by pointer).
+ */
+struct vtn_ssa_value *
+vtn_ssa_value(struct vtn_builder *b, uint32_t value_id)
+{
+   struct vtn_value *val = vtn_untyped_value(b, value_id);
+   switch (val->value_type) {
+   case vtn_value_type_undef:
+      return vtn_undef_ssa_value(b, val->type->type);
+
+   case vtn_value_type_constant:
+      return vtn_const_ssa_value(b, val->constant, val->const_type);
+
+   case vtn_value_type_ssa:
+      return val->ssa;
+
+   case vtn_value_type_deref:
+      /* This is needed for function parameters */
+      return vtn_variable_load(b, val->deref, val->deref_type);
+
+   default:
+      unreachable("Invalid type for an SSA value");
+   }
+}
+
+/* Copies a SPIR-V string literal (NUL-terminated UTF-8 packed into 32-bit
+ * words) into a ralloc'd C string owned by the builder.  ralloc_strndup
+ * bounds the copy, so a malformed literal missing its terminator cannot
+ * read past word_count words.  If words_used is non-NULL it receives the
+ * number of words the string occupied, so the caller can skip past it.
+ */
+static char *
+vtn_string_literal(struct vtn_builder *b, const uint32_t *words,
+                   unsigned word_count, unsigned *words_used)
+{
+   char *dup = ralloc_strndup(b, (char *)words, word_count * sizeof(*words));
+   if (words_used) {
+      /* Amount of space taken by the string (including the null) */
+      unsigned len = strlen(dup) + 1;
+      *words_used = DIV_ROUND_UP(len, sizeof(*words));
+   }
+   return dup;
+}
+
+/* Walks SPIR-V instructions in [start, end), invoking handler for each one.
+ * OpLine/OpNoLine are consumed here to maintain b->file/line/col debug
+ * state, and OpNop is skipped.  If the handler returns false, iteration
+ * stops and the pointer to the offending instruction is returned; otherwise
+ * the function returns end.
+ */
+const uint32_t *
+vtn_foreach_instruction(struct vtn_builder *b, const uint32_t *start,
+                        const uint32_t *end, vtn_instruction_handler handler)
+{
+   b->file = NULL;
+   b->line = -1;
+   b->col = -1;
+
+   const uint32_t *w = start;
+   while (w < end) {
+      /* Word 0 packs the opcode (low 16 bits) and the instruction's total
+       * word count (high 16 bits).
+       */
+      SpvOp opcode = w[0] & SpvOpCodeMask;
+      unsigned count = w[0] >> SpvWordCountShift;
+      assert(count >= 1 && w + count <= end);
+
+      switch (opcode) {
+      case SpvOpNop:
+         break; /* Do nothing */
+
+      case SpvOpLine:
+         b->file = vtn_value(b, w[1], vtn_value_type_string)->str;
+         b->line = w[2];
+         b->col = w[3];
+         break;
+
+      case SpvOpNoLine:
+         b->file = NULL;
+         b->line = -1;
+         b->col = -1;
+         break;
+
+      default:
+         if (!handler(b, opcode, w, count))
+            return w;
+         break;
+      }
+
+      w += count;
+   }
+   assert(w == end);
+   return w;
+}
+
+/* Handles OpExtInstImport and OpExtInst.  Only the "GLSL.std.450" extended
+ * instruction set is supported; importing it installs
+ * vtn_handle_glsl450_instruction as the per-value ext_handler, which
+ * OpExtInst then dispatches to.
+ */
+static void
+vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
+                     const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpExtInstImport: {
+      struct vtn_value *val = vtn_push_value(b, w[1], vtn_value_type_extension);
+      if (strcmp((const char *)&w[2], "GLSL.std.450") == 0) {
+         val->ext_handler = vtn_handle_glsl450_instruction;
+      } else {
+         assert(!"Unsupported extension");
+      }
+      break;
+   }
+
+   case SpvOpExtInst: {
+      struct vtn_value *val = vtn_value(b, w[3], vtn_value_type_extension);
+      bool handled = val->ext_handler(b, w[4], w, count);
+      /* (void) cast keeps release builds (NDEBUG) from warning about an
+       * otherwise-unused variable.
+       */
+      (void)handled;
+      assert(handled);
+      break;
+   }
+
+   default:
+      unreachable("Unhandled opcode");
+   }
+}
+
+/* Recursive worker for vtn_foreach_decoration().  base_value is the value
+ * the callback reports; value is the node currently being walked (it differs
+ * from base_value once we recurse into a decoration group).  parent_member
+ * carries the struct-member index across group recursion, since a member
+ * decoration may only be established at one level (hence the assert).
+ */
+static void
+_foreach_decoration_helper(struct vtn_builder *b,
+                           struct vtn_value *base_value,
+                           int parent_member,
+                           struct vtn_value *value,
+                           vtn_decoration_foreach_cb cb, void *data)
+{
+   for (struct vtn_decoration *dec = value->decoration; dec; dec = dec->next) {
+      int member;
+      if (dec->scope == VTN_DEC_DECORATION) {
+         member = parent_member;
+      } else if (dec->scope >= VTN_DEC_STRUCT_MEMBER0) {
+         assert(parent_member == -1);
+         member = dec->scope - VTN_DEC_STRUCT_MEMBER0;
+      } else {
+         /* Not a decoration (e.g. an execution mode) */
+         continue;
+      }
+
+      if (dec->group) {
+         assert(dec->group->value_type == vtn_value_type_decoration_group);
+         _foreach_decoration_helper(b, base_value, member, dec->group,
+                                    cb, data);
+      } else {
+         cb(b, base_value, member, dec, data);
+      }
+   }
+}
+
+/** Iterates (recursively if needed) over all of the decorations on a value
+ *
+ * This function iterates over all of the decorations applied to a given
+ * value.  If it encounters a decoration group, it recurses into the group
+ * and iterates over all of those decorations as well.  The callback receives
+ * the original value, the struct-member index (-1 for non-member
+ * decorations), and the decoration itself.
+ */
+void
+vtn_foreach_decoration(struct vtn_builder *b, struct vtn_value *value,
+                       vtn_decoration_foreach_cb cb, void *data)
+{
+   _foreach_decoration_helper(b, value, -1, value, cb, data);
+}
+
+/* Iterates over the execution modes recorded on a value (typically an entry
+ * point).  Execution modes share the decoration list but use the
+ * VTN_DEC_EXECUTION_MODE scope; unlike decorations they may not come from
+ * groups, hence the assert.
+ */
+void
+vtn_foreach_execution_mode(struct vtn_builder *b, struct vtn_value *value,
+                           vtn_execution_mode_foreach_cb cb, void *data)
+{
+   for (struct vtn_decoration *dec = value->decoration; dec; dec = dec->next) {
+      if (dec->scope != VTN_DEC_EXECUTION_MODE)
+         continue;
+
+      assert(dec->group == NULL);
+      cb(b, value, dec, data);
+   }
+}
+
+/* Handles the decoration opcodes: OpDecorate, OpMemberDecorate,
+ * OpExecutionMode, OpDecorationGroup, OpGroupDecorate, and
+ * OpGroupMemberDecorate.  Decorations are not applied eagerly; each one is
+ * recorded on the target vtn_value as a node in a singly-linked list of
+ * vtn_decoration and applied later via vtn_foreach_decoration() /
+ * vtn_foreach_execution_mode().
+ */
+static void
+vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode,
+                      const uint32_t *w, unsigned count)
+{
+   const uint32_t *w_end = w + count;
+   const uint32_t target = w[1];
+   w += 2;
+
+   switch (opcode) {
+   case SpvOpDecorationGroup:
+      vtn_push_value(b, target, vtn_value_type_decoration_group);
+      break;
+
+   case SpvOpDecorate:
+   case SpvOpMemberDecorate:
+   case SpvOpExecutionMode: {
+      struct vtn_value *val = &b->values[target];
+
+      struct vtn_decoration *dec = rzalloc(b, struct vtn_decoration);
+      switch (opcode) {
+      case SpvOpDecorate:
+         dec->scope = VTN_DEC_DECORATION;
+         break;
+      case SpvOpMemberDecorate:
+         /* w already points at the member-index literal here */
+         dec->scope = VTN_DEC_STRUCT_MEMBER0 + *(w++);
+         break;
+      case SpvOpExecutionMode:
+         dec->scope = VTN_DEC_EXECUTION_MODE;
+         break;
+      default:
+         unreachable("Invalid decoration opcode");
+      }
+      dec->decoration = *(w++);
+      dec->literals = w;
+
+      /* Link into the list */
+      dec->next = val->decoration;
+      val->decoration = dec;
+      break;
+   }
+
+   case SpvOpGroupMemberDecorate:
+   case SpvOpGroupDecorate: {
+      struct vtn_value *group =
+         vtn_value(b, target, vtn_value_type_decoration_group);
+
+      for (; w < w_end; w++) {
+         struct vtn_value *val = vtn_untyped_value(b, *w);
+         struct vtn_decoration *dec = rzalloc(b, struct vtn_decoration);
+
+         dec->group = group;
+         if (opcode == SpvOpGroupDecorate) {
+            dec->scope = VTN_DEC_DECORATION;
+         } else {
+            /* OpGroupMemberDecorate operands are <target-id, member> pairs,
+             * so the member literal is the word *after* the target we just
+             * read.  Pre-increment is required here: the previous
+             * post-increment (*(w++)) re-read the target id as the member
+             * index and then treated the member word as the next target,
+             * desynchronizing the whole pair walk.
+             */
+            dec->scope = VTN_DEC_STRUCT_MEMBER0 + *(++w);
+         }
+
+         /* Link into the list */
+         dec->next = val->decoration;
+         val->decoration = dec;
+      }
+      break;
+   }
+
+   default:
+      unreachable("Unhandled opcode");
+   }
+}
+
+/* Context passed to struct_member_decoration_cb while building a struct
+ * type: the glsl_struct_field array being filled in, plus the vtn_type whose
+ * members/offsets arrays receive per-member metadata.
+ */
+struct member_decoration_ctx {
+   struct glsl_struct_field *fields;
+   struct vtn_type *type;
+};
+
+/* Does a shallow copy of a vtn_type.
+ *
+ * Child vtn_type pointers (array_element, members[]) are shared with the
+ * source, not cloned; only the members/offsets arrays themselves are
+ * reallocated for structs.  Used when a decoration needs to mutate a type
+ * without disturbing other users of the original (copy-before-write).
+ */
+
+static struct vtn_type *
+vtn_type_copy(struct vtn_builder *b, struct vtn_type *src)
+{
+   struct vtn_type *dest = ralloc(b, struct vtn_type);
+   dest->type = src->type;
+   dest->is_builtin = src->is_builtin;
+   if (src->is_builtin)
+      dest->builtin = src->builtin;
+
+   if (!glsl_type_is_vector_or_scalar(src->type)) {
+      switch (glsl_get_base_type(src->type)) {
+      case GLSL_TYPE_ARRAY:
+         dest->array_element = src->array_element;
+         dest->stride = src->stride;
+         break;
+
+      case GLSL_TYPE_INT:
+      case GLSL_TYPE_UINT:
+      case GLSL_TYPE_BOOL:
+      case GLSL_TYPE_FLOAT:
+      case GLSL_TYPE_DOUBLE:
+         /* matrices (non-vector/scalar numeric types) */
+         dest->row_major = src->row_major;
+         dest->stride = src->stride;
+         break;
+
+      case GLSL_TYPE_STRUCT: {
+         unsigned elems = glsl_get_length(src->type);
+
+         dest->members = ralloc_array(b, struct vtn_type *, elems);
+         memcpy(dest->members, src->members, elems * sizeof(struct vtn_type *));
+
+         dest->offsets = ralloc_array(b, unsigned, elems);
+         memcpy(dest->offsets, src->offsets, elems * sizeof(unsigned));
+         break;
+      }
+
+      default:
+         unreachable("unhandled type");
+      }
+   }
+
+   return dest;
+}
+
+/* Returns a privately-owned (copy-on-write) matrix type for struct member
+ * `member`, so that a MatrixStride/RowMajor decoration can be recorded
+ * without mutating a type shared with other values.  Walks through any
+ * array-of-matrix wrappers, copying each level on the way down.
+ */
+static struct vtn_type *
+mutable_matrix_member(struct vtn_builder *b, struct vtn_type *type, int member)
+{
+   type->members[member] = vtn_type_copy(b, type->members[member]);
+   type = type->members[member];
+
+   /* We may have an array of matrices.... Oh, joy! */
+   while (glsl_type_is_array(type->type)) {
+      type->array_element = vtn_type_copy(b, type->array_element);
+      type = type->array_element;
+   }
+
+   assert(glsl_type_is_matrix(type->type));
+
+   return type;
+}
+
+/* vtn_foreach_decoration callback applied while building an OpTypeStruct.
+ * Records per-member decorations into the glsl_struct_field array and the
+ * vtn_type (offsets, matrix stride/layout, builtin markers).  Non-member
+ * decorations (member < 0) are ignored here.
+ */
+static void
+struct_member_decoration_cb(struct vtn_builder *b,
+                            struct vtn_value *val, int member,
+                            const struct vtn_decoration *dec, void *void_ctx)
+{
+   struct member_decoration_ctx *ctx = void_ctx;
+
+   if (member < 0)
+      return;
+
+   switch (dec->decoration) {
+   case SpvDecorationRelaxedPrecision:
+      break; /* FIXME: Do nothing with this for now. */
+   case SpvDecorationNoPerspective:
+      ctx->fields[member].interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
+      break;
+   case SpvDecorationFlat:
+      ctx->fields[member].interpolation = INTERP_QUALIFIER_FLAT;
+      break;
+   case SpvDecorationCentroid:
+      ctx->fields[member].centroid = true;
+      break;
+   case SpvDecorationSample:
+      ctx->fields[member].sample = true;
+      break;
+   case SpvDecorationLocation:
+      ctx->fields[member].location = dec->literals[0];
+      break;
+   case SpvDecorationBuiltIn:
+      /* Copy-on-write so shared member types aren't marked builtin. */
+      ctx->type->members[member] = vtn_type_copy(b, ctx->type->members[member]);
+      ctx->type->members[member]->is_builtin = true;
+      ctx->type->members[member]->builtin = dec->literals[0];
+      ctx->type->builtin_block = true;
+      break;
+   case SpvDecorationOffset:
+      ctx->type->offsets[member] = dec->literals[0];
+      break;
+   case SpvDecorationMatrixStride:
+      mutable_matrix_member(b, ctx->type, member)->stride = dec->literals[0];
+      break;
+   case SpvDecorationColMajor:
+      break; /* Nothing to do here.  Column-major is the default. */
+   case SpvDecorationRowMajor:
+      mutable_matrix_member(b, ctx->type, member)->row_major = true;
+      break;
+   default:
+      unreachable("Unhandled member decoration");
+   }
+}
+
+/* vtn_foreach_decoration callback for whole-type decorations (member == -1):
+ * array stride, Block/BufferBlock interface markers, and a few layout
+ * decorations that are ignored because explicit offsets are always present.
+ */
+static void
+type_decoration_cb(struct vtn_builder *b,
+                   struct vtn_value *val, int member,
+                    const struct vtn_decoration *dec, void *ctx)
+{
+   struct vtn_type *type = val->type;
+
+   if (member != -1)
+      return;
+
+   switch (dec->decoration) {
+   case SpvDecorationArrayStride:
+      type->stride = dec->literals[0];
+      break;
+   case SpvDecorationBlock:
+      type->block = true;
+      break;
+   case SpvDecorationBufferBlock:
+      type->buffer_block = true;
+      break;
+   case SpvDecorationGLSLShared:
+   case SpvDecorationGLSLPacked:
+      /* Ignore these, since we get explicit offsets anyways */
+      break;
+
+   case SpvDecorationStream:
+      /* Only stream 0 is supported. */
+      assert(dec->literals[0] == 0);
+      break;
+
+   default:
+      unreachable("Unhandled type decoration");
+   }
+}
+
+/* Translates a SPIR-V image format enum into the corresponding GL internal
+ * format token (values hard-coded here to avoid a GL header dependency).
+ * Returns 0 (GL_NONE) for SpvImageFormatUnknown.
+ */
+static unsigned
+translate_image_format(SpvImageFormat format)
+{
+   switch (format) {
+   case SpvImageFormatUnknown:      return 0;      /* GL_NONE */
+   case SpvImageFormatRgba32f:      return 0x8814; /* GL_RGBA32F */
+   case SpvImageFormatRgba16f:      return 0x881A; /* GL_RGBA16F */
+   case SpvImageFormatR32f:         return 0x822E; /* GL_R32F */
+   case SpvImageFormatRgba8:        return 0x8058; /* GL_RGBA8 */
+   case SpvImageFormatRgba8Snorm:   return 0x8F97; /* GL_RGBA8_SNORM */
+   case SpvImageFormatRg32f:        return 0x8230; /* GL_RG32F */
+   case SpvImageFormatRg16f:        return 0x822F; /* GL_RG16F */
+   case SpvImageFormatR11fG11fB10f: return 0x8C3A; /* GL_R11F_G11F_B10F */
+   case SpvImageFormatR16f:         return 0x822D; /* GL_R16F */
+   case SpvImageFormatRgba16:       return 0x805B; /* GL_RGBA16 */
+   case SpvImageFormatRgb10A2:      return 0x8059; /* GL_RGB10_A2 */
+   case SpvImageFormatRg16:         return 0x822C; /* GL_RG16 */
+   case SpvImageFormatRg8:          return 0x822B; /* GL_RG8 */
+   case SpvImageFormatR16:          return 0x822A; /* GL_R16 */
+   case SpvImageFormatR8:           return 0x8229; /* GL_R8 */
+   case SpvImageFormatRgba16Snorm:  return 0x8F9B; /* GL_RGBA16_SNORM */
+   case SpvImageFormatRg16Snorm:    return 0x8F99; /* GL_RG16_SNORM */
+   case SpvImageFormatRg8Snorm:     return 0x8F95; /* GL_RG8_SNORM */
+   case SpvImageFormatR16Snorm:     return 0x8F98; /* GL_R16_SNORM */
+   case SpvImageFormatR8Snorm:      return 0x8F94; /* GL_R8_SNORM */
+   case SpvImageFormatRgba32i:      return 0x8D82; /* GL_RGBA32I */
+   case SpvImageFormatRgba16i:      return 0x8D88; /* GL_RGBA16I */
+   case SpvImageFormatRgba8i:       return 0x8D8E; /* GL_RGBA8I */
+   case SpvImageFormatR32i:         return 0x8235; /* GL_R32I */
+   case SpvImageFormatRg32i:        return 0x823B; /* GL_RG32I */
+   case SpvImageFormatRg16i:        return 0x8239; /* GL_RG16I */
+   case SpvImageFormatRg8i:         return 0x8237; /* GL_RG8I */
+   case SpvImageFormatR16i:         return 0x8233; /* GL_R16I */
+   case SpvImageFormatR8i:          return 0x8231; /* GL_R8I */
+   case SpvImageFormatRgba32ui:     return 0x8D70; /* GL_RGBA32UI */
+   case SpvImageFormatRgba16ui:     return 0x8D76; /* GL_RGBA16UI */
+   case SpvImageFormatRgba8ui:      return 0x8D7C; /* GL_RGBA8UI */
+   case SpvImageFormatR32ui:        return 0x8236; /* GL_R32UI */
+   case SpvImageFormatRgb10a2ui:    return 0x906F; /* GL_RGB10_A2UI */
+   case SpvImageFormatRg32ui:       return 0x823C; /* GL_RG32UI */
+   case SpvImageFormatRg16ui:       return 0x823A; /* GL_RG16UI */
+   case SpvImageFormatRg8ui:        return 0x8238; /* GL_RG8UI */
+   /* Was 0x823A (GL_RG16UI), a copy-paste slip from the Rg16ui case:
+    * R16ui must map to the single-channel GL_R16UI token.
+    */
+   case SpvImageFormatR16ui:        return 0x8234; /* GL_R16UI */
+   case SpvImageFormatR8ui:         return 0x8232; /* GL_R8UI */
+   default:
+      assert(!"Invalid image format");
+      return 0;
+   }
+}
+
+/* Handles all OpType* opcodes: builds the vtn_type (and underlying
+ * glsl_type) for the result <id>, then applies any whole-type decorations
+ * via type_decoration_cb.  Struct member decorations are applied inline
+ * while the field array is being assembled.
+ */
+static void
+vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
+                const uint32_t *w, unsigned count)
+{
+   struct vtn_value *val = vtn_push_value(b, w[1], vtn_value_type_type);
+
+   val->type = rzalloc(b, struct vtn_type);
+   val->type->is_builtin = false;
+
+   switch (opcode) {
+   case SpvOpTypeVoid:
+      val->type->type = glsl_void_type();
+      break;
+   case SpvOpTypeBool:
+      val->type->type = glsl_bool_type();
+      break;
+   case SpvOpTypeInt:
+      /* NOTE(review): the width (w[2]) and signedness (w[3]) operands are
+       * ignored; everything becomes a 32-bit int.  Confirm this is
+       * intentional for the SPIR-V versions supported here.
+       */
+      val->type->type = glsl_int_type();
+      break;
+   case SpvOpTypeFloat:
+      val->type->type = glsl_float_type();
+      break;
+
+   case SpvOpTypeVector: {
+      struct vtn_type *base = vtn_value(b, w[2], vtn_value_type_type)->type;
+      unsigned elems = w[3];
+
+      assert(glsl_type_is_scalar(base->type));
+      val->type->type = glsl_vector_type(glsl_get_base_type(base->type), elems);
+
+      /* Vectors implicitly have sizeof(base_type) stride.  For now, this
+       * is always 4 bytes.  This will have to change if we want to start
+       * supporting doubles or half-floats.
+       */
+      val->type->stride = 4;
+      val->type->array_element = base;
+      break;
+   }
+
+   case SpvOpTypeMatrix: {
+      struct vtn_type *base = vtn_value(b, w[2], vtn_value_type_type)->type;
+      unsigned columns = w[3];
+
+      assert(glsl_type_is_vector(base->type));
+      val->type->type = glsl_matrix_type(glsl_get_base_type(base->type),
+                                         glsl_get_vector_elements(base->type),
+                                         columns);
+      assert(!glsl_type_is_error(val->type->type));
+      val->type->array_element = base;
+      val->type->row_major = false;
+      val->type->stride = 0;
+      break;
+   }
+
+   case SpvOpTypeRuntimeArray:
+   case SpvOpTypeArray: {
+      struct vtn_type *array_element =
+         vtn_value(b, w[2], vtn_value_type_type)->type;
+
+      unsigned length;
+      if (opcode == SpvOpTypeRuntimeArray) {
+         /* A length of 0 is used to denote unsized arrays */
+         length = 0;
+      } else {
+         /* OpTypeArray's length operand is a constant <id>, not a literal */
+         length =
+            vtn_value(b, w[3], vtn_value_type_constant)->constant->value.u[0];
+      }
+
+      val->type->type = glsl_array_type(array_element->type, length);
+      val->type->array_element = array_element;
+      val->type->stride = 0;
+      break;
+   }
+
+   case SpvOpTypeStruct: {
+      unsigned num_fields = count - 2;
+      val->type->members = ralloc_array(b, struct vtn_type *, num_fields);
+      val->type->offsets = ralloc_array(b, unsigned, num_fields);
+
+      NIR_VLA(struct glsl_struct_field, fields, count);
+      for (unsigned i = 0; i < num_fields; i++) {
+         val->type->members[i] =
+            vtn_value(b, w[i + 2], vtn_value_type_type)->type;
+         fields[i] = (struct glsl_struct_field) {
+            .type = val->type->members[i]->type,
+            .name = ralloc_asprintf(b, "field%d", i),
+            .location = -1,
+         };
+      }
+
+      struct member_decoration_ctx ctx = {
+         .fields = fields,
+         .type = val->type
+      };
+
+      /* Apply per-member decorations (offsets, interpolation, builtins) */
+      vtn_foreach_decoration(b, val, struct_member_decoration_cb, &ctx);
+
+      const char *name = val->name ? val->name : "struct";
+
+      val->type->type = glsl_struct_type(fields, num_fields, name);
+      break;
+   }
+
+   case SpvOpTypeFunction: {
+      const struct glsl_type *return_type =
+         vtn_value(b, w[2], vtn_value_type_type)->type->type;
+      NIR_VLA(struct glsl_function_param, params, count - 3);
+      for (unsigned i = 0; i < count - 3; i++) {
+         params[i].type = vtn_value(b, w[i + 3], vtn_value_type_type)->type->type;
+
+         /* FIXME: */
+         params[i].in = true;
+         params[i].out = true;
+      }
+      val->type->type = glsl_function_type(return_type, params, count - 3);
+      break;
+   }
+
+   case SpvOpTypePointer:
+      /* FIXME:  For now, we'll just do the really lame thing and return
+       * the same type.  The validator should ensure that the proper number
+       * of dereferences happen
+       */
+      val->type = vtn_value(b, w[3], vtn_value_type_type)->type;
+      break;
+
+   case SpvOpTypeImage: {
+      const struct glsl_type *sampled_type =
+         vtn_value(b, w[2], vtn_value_type_type)->type->type;
+
+      assert(glsl_type_is_vector_or_scalar(sampled_type));
+
+      enum glsl_sampler_dim dim;
+      switch ((SpvDim)w[3]) {
+      case SpvDim1D:       dim = GLSL_SAMPLER_DIM_1D;    break;
+      case SpvDim2D:       dim = GLSL_SAMPLER_DIM_2D;    break;
+      case SpvDim3D:       dim = GLSL_SAMPLER_DIM_3D;    break;
+      case SpvDimCube:     dim = GLSL_SAMPLER_DIM_CUBE;  break;
+      case SpvDimRect:     dim = GLSL_SAMPLER_DIM_RECT;  break;
+      case SpvDimBuffer:   dim = GLSL_SAMPLER_DIM_BUF;   break;
+      default:
+         unreachable("Invalid SPIR-V Sampler dimension");
+      }
+
+      bool is_shadow = w[4];
+      bool is_array = w[5];
+      bool multisampled = w[6];
+      unsigned sampled = w[7];
+      SpvImageFormat format = w[8];
+
+      /* The access qualifier operand is optional */
+      if (count > 9)
+         val->type->access_qualifier = w[9];
+      else
+         val->type->access_qualifier = SpvAccessQualifierReadWrite;
+
+      assert(!multisampled && "FIXME: Handl multi-sampled textures");
+
+      val->type->image_format = translate_image_format(format);
+
+      /* sampled == 1 means "used with a sampler" (texture); sampled == 2
+       * means "used without a sampler" (storage image).
+       */
+      if (sampled == 1) {
+         val->type->type = glsl_sampler_type(dim, is_shadow, is_array,
+                                             glsl_get_base_type(sampled_type));
+      } else if (sampled == 2) {
+         assert(format);
+         assert(!is_shadow);
+         val->type->type = glsl_image_type(dim, is_array,
+                                           glsl_get_base_type(sampled_type));
+      } else {
+         assert(!"We need to know if the image will be sampled");
+      }
+      break;
+   }
+
+   case SpvOpTypeSampledImage:
+      /* Treated as an alias of the underlying image type */
+      val->type = vtn_value(b, w[2], vtn_value_type_type)->type;
+      break;
+
+   case SpvOpTypeSampler:
+      /* The actual sampler type here doesn't really matter.  It gets
+       * thrown away the moment you combine it with an image.  What really
+       * matters is that it's a sampler type as opposed to an integer type
+       * so the backend knows what to do.
+       *
+       * TODO: Eventually we should consider adding a "bare sampler" type
+       * to glsl_types.
+       */
+      val->type->type = glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, false,
+                                          GLSL_TYPE_FLOAT);
+      break;
+
+   case SpvOpTypeOpaque:
+   case SpvOpTypeEvent:
+   case SpvOpTypeDeviceEvent:
+   case SpvOpTypeReserveId:
+   case SpvOpTypeQueue:
+   case SpvOpTypePipe:
+   default:
+      unreachable("Unhandled opcode");
+   }
+
+   vtn_foreach_decoration(b, val, type_decoration_cb, NULL);
+}
+
+/* Builds an all-zeros nir_constant for the given type (used for
+ * OpConstantNull).  Scalars/vectors/matrices rely on rzalloc's
+ * zero-initialization; arrays share a single zero element, and structs
+ * recurse per field.
+ */
+static nir_constant *
+vtn_null_constant(struct vtn_builder *b, const struct glsl_type *type)
+{
+   nir_constant *c = rzalloc(b, nir_constant);
+
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+      /* Nothing to do here.  It's already initialized to zero */
+      break;
+
+   case GLSL_TYPE_ARRAY:
+      assert(glsl_get_length(type) > 0);
+      c->num_elements = glsl_get_length(type);
+      c->elements = ralloc_array(b, nir_constant *, c->num_elements);
+
+      /* All elements are identical zeros, so alias the first one. */
+      c->elements[0] = vtn_null_constant(b, glsl_get_array_element(type));
+      for (unsigned i = 1; i < c->num_elements; i++)
+         c->elements[i] = c->elements[0];
+      break;
+
+   case GLSL_TYPE_STRUCT:
+      c->num_elements = glsl_get_length(type);
+      c->elements = ralloc_array(b, nir_constant *, c->num_elements);
+
+      for (unsigned i = 0; i < c->num_elements; i++) {
+         c->elements[i] = vtn_null_constant(b, glsl_get_struct_field(type, i));
+      }
+      break;
+
+   default:
+      unreachable("Invalid type for null constant");
+   }
+
+   return c;
+}
+
+/* vtn_foreach_decoration callback for specialization constants: if the
+ * value carries a SpecId decoration and the client supplied a
+ * specialization for that id, overwrite *data (the default constant word)
+ * with the client-provided value.
+ *
+ * NOTE(review): "deocoration" in the name is a typo for "decoration";
+ * left as-is since callers elsewhere in this file may reference it.
+ */
+static void
+spec_constant_deocoration_cb(struct vtn_builder *b, struct vtn_value *v,
+                             int member, const struct vtn_decoration *dec,
+                             void *data)
+{
+   assert(member == -1);
+   if (dec->decoration != SpvDecorationSpecId)
+      return;
+
+   uint32_t *const_value = data;
+
+   for (unsigned i = 0; i < b->num_specializations; i++) {
+      if (b->specializations[i].id == dec->literals[0]) {
+         *const_value = b->specializations[i].data;
+         return;
+      }
+   }
+}
+
+/* Resolves the concrete 32-bit value of a spec constant: starts from the
+ * module's default (const_value) and lets spec_constant_deocoration_cb
+ * replace it if a matching client specialization exists.
+ */
+static uint32_t
+get_specialization(struct vtn_builder *b, struct vtn_value *val,
+                   uint32_t const_value)
+{
+   vtn_foreach_decoration(b, val, spec_constant_deocoration_cb, &const_value);
+   return const_value;
+}
+
+static void
+vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
+                    const uint32_t *w, unsigned count)
+{
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_constant);
+   val->const_type = vtn_value(b, w[1], vtn_value_type_type)->type->type;
+   val->constant = rzalloc(b, nir_constant);
+   switch (opcode) {
+   case SpvOpConstantTrue:
+      assert(val->const_type == glsl_bool_type());
+      val->constant->value.u[0] = NIR_TRUE;
+      break;
+   case SpvOpConstantFalse:
+      assert(val->const_type == glsl_bool_type());
+      val->constant->value.u[0] = NIR_FALSE;
+      break;
+
+   case SpvOpSpecConstantTrue:
+   case SpvOpSpecConstantFalse: {
+      assert(val->const_type == glsl_bool_type());
+      uint32_t int_val =
+         get_specialization(b, val, (opcode == SpvOpSpecConstantTrue));
+      val->constant->value.u[0] = int_val ? NIR_TRUE : NIR_FALSE;
+      break;
+   }
+
+   case SpvOpConstant:
+      assert(glsl_type_is_scalar(val->const_type));
+      val->constant->value.u[0] = w[3];
+      break;
+   case SpvOpSpecConstant:
+      assert(glsl_type_is_scalar(val->const_type));
+      val->constant->value.u[0] = get_specialization(b, val, w[3]);
+      break;
+   case SpvOpSpecConstantComposite:
+   case SpvOpConstantComposite: {
+      unsigned elem_count = count - 3;
+      nir_constant **elems = ralloc_array(b, nir_constant *, elem_count);
+      for (unsigned i = 0; i < elem_count; i++)
+         elems[i] = vtn_value(b, w[i + 3], vtn_value_type_constant)->constant;
+
+      switch (glsl_get_base_type(val->const_type)) {
+      case GLSL_TYPE_UINT:
+      case GLSL_TYPE_INT:
+      case GLSL_TYPE_FLOAT:
+      case GLSL_TYPE_BOOL:
+         if (glsl_type_is_matrix(val->const_type)) {
+            unsigned rows = glsl_get_vector_elements(val->const_type);
+            assert(glsl_get_matrix_columns(val->const_type) == elem_count);
+            for (unsigned i = 0; i < elem_count; i++)
+               for (unsigned j = 0; j < rows; j++)
+                  val->constant->value.u[rows * i + j] = elems[i]->value.u[j];
+         } else {
+            assert(glsl_type_is_vector(val->const_type));
+            assert(glsl_get_vector_elements(val->const_type) == elem_count);
+            for (unsigned i = 0; i < elem_count; i++)
+               val->constant->value.u[i] = elems[i]->value.u[0];
+         }
+         ralloc_free(elems);
+         break;
+
+      case GLSL_TYPE_STRUCT:
+      case GLSL_TYPE_ARRAY:
+         ralloc_steal(val->constant, elems);
+         val->constant->num_elements = elem_count;
+         val->constant->elements = elems;
+         break;
+
+      default:
+         unreachable("Unsupported type for constants");
+      }
+      break;
+   }
+
+   case SpvOpSpecConstantOp: {
+      SpvOp opcode = get_specialization(b, val, w[3]);
+      switch (opcode) {
+      case SpvOpVectorShuffle: {
+         struct vtn_value *v0 = vtn_value(b, w[4], vtn_value_type_constant);
+         struct vtn_value *v1 = vtn_value(b, w[5], vtn_value_type_constant);
+         unsigned len0 = glsl_get_vector_elements(v0->const_type);
+         unsigned len1 = glsl_get_vector_elements(v1->const_type);
+
+         uint32_t u[8];
+         for (unsigned i = 0; i < len0; i++)
+            u[i] = v0->constant->value.u[i];
+         for (unsigned i = 0; i < len1; i++)
+            u[len0 + i] = v1->constant->value.u[i];
+
+         for (unsigned i = 0; i < count - 6; i++) {
+            uint32_t comp = w[i + 6];
+            if (comp == (uint32_t)-1) {
+               val->constant->value.u[i] = 0xdeadbeef;
+            } else {
+               val->constant->value.u[i] = u[comp];
+            }
+         }
+         return;
+      }
+
+      case SpvOpCompositeExtract:
+      case SpvOpCompositeInsert: {
+         struct vtn_value *comp;
+         unsigned deref_start;
+         struct nir_constant **c;
+         if (opcode == SpvOpCompositeExtract) {
+            comp = vtn_value(b, w[4], vtn_value_type_constant);
+            deref_start = 5;
+            c = &comp->constant;
+         } else {
+            comp = vtn_value(b, w[5], vtn_value_type_constant);
+            deref_start = 6;
+            val->constant = nir_constant_clone(comp->constant,
+                                               (nir_variable *)b);
+            c = &val->constant;
+         }
+
+         int elem = -1;
+         const struct glsl_type *type = comp->const_type;
+         for (unsigned i = deref_start; i < count; i++) {
+            switch (glsl_get_base_type(type)) {
+            case GLSL_TYPE_UINT:
+            case GLSL_TYPE_INT:
+            case GLSL_TYPE_FLOAT:
+            case GLSL_TYPE_BOOL:
+               /* If we hit this granularity, we're picking off an element */
+               if (elem < 0)
+                  elem = 0;
+
+               if (glsl_type_is_matrix(type)) {
+                  elem += w[i] * glsl_get_vector_elements(type);
+                  type = glsl_get_column_type(type);
+               } else {
+                  assert(glsl_type_is_vector(type));
+                  elem += w[i];
+                  type = glsl_scalar_type(glsl_get_base_type(type));
+               }
+               continue;
+
+            case GLSL_TYPE_ARRAY:
+               c = &(*c)->elements[w[i]];
+               type = glsl_get_array_element(type);
+               continue;
+
+            case GLSL_TYPE_STRUCT:
+               c = &(*c)->elements[w[i]];
+               type = glsl_get_struct_field(type, w[i]);
+               continue;
+
+            default:
+               unreachable("Invalid constant type");
+            }
+         }
+
+         if (opcode == SpvOpCompositeExtract) {
+            if (elem == -1) {
+               val->constant = *c;
+            } else {
+               unsigned num_components = glsl_get_vector_elements(type);
+               for (unsigned i = 0; i < num_components; i++)
+                  val->constant->value.u[i] = (*c)->value.u[elem + i];
+            }
+         } else {
+            struct vtn_value *insert =
+               vtn_value(b, w[4], vtn_value_type_constant);
+            assert(insert->const_type == type);
+            if (elem == -1) {
+               *c = insert->constant;
+            } else {
+               unsigned num_components = glsl_get_vector_elements(type);
+               for (unsigned i = 0; i < num_components; i++)
+                  (*c)->value.u[elem + i] = insert->constant->value.u[i];
+            }
+         }
+         return;
+      }
+
+      default: {
+         bool swap;
+         nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap);
+
+         unsigned num_components = glsl_get_vector_elements(val->const_type);
+
+         nir_const_value src[3];
+         assert(count <= 7);
+         for (unsigned i = 0; i < count - 4; i++) {
+            nir_constant *c =
+               vtn_value(b, w[4 + i], vtn_value_type_constant)->constant;
+
+            unsigned j = swap ? 1 - i : i;
+            for (unsigned k = 0; k < num_components; k++)
+               src[j].u[k] = c->value.u[k];
+         }
+
+         nir_const_value res = nir_eval_const_opcode(op, num_components, src);
+
+         for (unsigned k = 0; k < num_components; k++)
+            val->constant->value.u[k] = res.u[k];
+
+         return;
+      } /* default */
+      }
+   }
+
+   case SpvOpConstantNull:
+      val->constant = vtn_null_constant(b, val->const_type);
+      break;
+
+   case SpvOpConstantSampler:
+      assert(!"OpConstantSampler requires Kernel Capability");
+      break;
+
+   default:
+      unreachable("Unhandled opcode");
+   }
+}
+
+/* Force @mode to nir_var_system_value.  Only legal for variables that were
+ * declared as shader inputs or are already system values.
+ */
+static void
+set_mode_system_value(nir_variable_mode *mode)
+{
+   assert(*mode == nir_var_shader_in || *mode == nir_var_system_value);
+   *mode = nir_var_system_value;
+}
+
+/* Translate a SPIR-V BuiltIn decoration into a Mesa slot number and, where
+ * needed, a corrected variable mode.
+ *
+ * Depending on the builtin, *location receives a VARYING_SLOT_*,
+ * FRAG_RESULT_*, or SYSTEM_VALUE_* value.  *mode is updated in place:
+ * builtins that are really system values are switched from shader_in to
+ * nir_var_system_value, and some (Layer, PrimitiveId, ViewportIndex in
+ * geometry shaders) are forced to shader_out.  Builtins with no support
+ * here (tessellation, CullDistance, ...) hit unreachable().
+ */
+static void
+vtn_get_builtin_location(struct vtn_builder *b,
+                         SpvBuiltIn builtin, int *location,
+                         nir_variable_mode *mode)
+{
+   switch (builtin) {
+   case SpvBuiltInPosition:
+      *location = VARYING_SLOT_POS;
+      break;
+   case SpvBuiltInPointSize:
+      *location = VARYING_SLOT_PSIZ;
+      break;
+   case SpvBuiltInClipDistance:
+      *location = VARYING_SLOT_CLIP_DIST0; /* XXX CLIP_DIST1? */
+      break;
+   case SpvBuiltInCullDistance:
+      /* XXX figure this out */
+      unreachable("unhandled builtin");
+   case SpvBuiltInVertexIndex:
+      *location = SYSTEM_VALUE_VERTEX_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInVertexId:
+      /* Vulkan defines VertexID to be zero-based and reserves the new
+       * builtin keyword VertexIndex to indicate the non-zero-based value.
+       */
+      *location = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInInstanceIndex:
+      /* XXX: currently handled identically to InstanceId (fallthrough) */
+   case SpvBuiltInInstanceId:
+      *location = SYSTEM_VALUE_INSTANCE_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInPrimitiveId:
+      *location = VARYING_SLOT_PRIMITIVE_ID;
+      *mode = nir_var_shader_out;
+      break;
+   case SpvBuiltInInvocationId:
+      *location = SYSTEM_VALUE_INVOCATION_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInLayer:
+      *location = VARYING_SLOT_LAYER;
+      *mode = nir_var_shader_out;
+      break;
+   case SpvBuiltInViewportIndex:
+      *location = VARYING_SLOT_VIEWPORT;
+      /* Viewport index is written by geometry shaders and read by fragment
+       * shaders; no other stage may use it here.
+       */
+      if (b->shader->stage == MESA_SHADER_GEOMETRY)
+         *mode = nir_var_shader_out;
+      else if (b->shader->stage == MESA_SHADER_FRAGMENT)
+         *mode = nir_var_shader_in;
+      else
+         unreachable("invalid stage for SpvBuiltInViewportIndex");
+      break;
+   case SpvBuiltInTessLevelOuter:
+   case SpvBuiltInTessLevelInner:
+   case SpvBuiltInTessCoord:
+   case SpvBuiltInPatchVertices:
+      unreachable("no tessellation support");
+   case SpvBuiltInFragCoord:
+      *location = VARYING_SLOT_POS;
+      assert(*mode == nir_var_shader_in);
+      break;
+   case SpvBuiltInPointCoord:
+      *location = VARYING_SLOT_PNTC;
+      assert(*mode == nir_var_shader_in);
+      break;
+   case SpvBuiltInFrontFacing:
+      *location = VARYING_SLOT_FACE;
+      assert(*mode == nir_var_shader_in);
+      break;
+   case SpvBuiltInSampleId:
+      *location = SYSTEM_VALUE_SAMPLE_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInSamplePosition:
+      *location = SYSTEM_VALUE_SAMPLE_POS;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInSampleMask:
+      *location = SYSTEM_VALUE_SAMPLE_MASK_IN; /* XXX out? */
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInFragDepth:
+      *location = FRAG_RESULT_DEPTH;
+      assert(*mode == nir_var_shader_out);
+      break;
+   case SpvBuiltInNumWorkgroups:
+      *location = SYSTEM_VALUE_NUM_WORK_GROUPS;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInWorkgroupSize:
+      /* This should already be handled */
+      unreachable("unsupported builtin");
+      break;
+   case SpvBuiltInWorkgroupId:
+      *location = SYSTEM_VALUE_WORK_GROUP_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInLocalInvocationId:
+      *location = SYSTEM_VALUE_LOCAL_INVOCATION_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInLocalInvocationIndex:
+      *location = SYSTEM_VALUE_LOCAL_INVOCATION_INDEX;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInGlobalInvocationId:
+      *location = SYSTEM_VALUE_GLOBAL_INVOCATION_ID;
+      set_mode_system_value(mode);
+      break;
+   case SpvBuiltInHelperInvocation:
+   default:
+      unreachable("unsupported builtin");
+   }
+}
+
+/* Per-decoration callback for variables: copies each SPIR-V decoration
+ * (interpolation, location, binding, builtin, ...) onto the corresponding
+ * field of the nir_variable passed in as void_var.  val is the vtn_value
+ * for the variable's root deref and is only used for sanity checks; the
+ * member argument is not referenced for plain variable decorations.
+ */
+static void
+var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
+                  const struct vtn_decoration *dec, void *void_var)
+{
+   assert(val->value_type == vtn_value_type_deref);
+   assert(val->deref->deref.child == NULL);
+   assert(val->deref->var == void_var);
+
+   nir_variable *var = void_var;
+   switch (dec->decoration) {
+   case SpvDecorationRelaxedPrecision:
+      break; /* FIXME: Do nothing with this for now. */
+   case SpvDecorationNoPerspective:
+      var->data.interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
+      break;
+   case SpvDecorationFlat:
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
+      break;
+   case SpvDecorationCentroid:
+      var->data.centroid = true;
+      break;
+   case SpvDecorationSample:
+      var->data.sample = true;
+      break;
+   case SpvDecorationInvariant:
+      var->data.invariant = true;
+      break;
+   case SpvDecorationConstant:
+      assert(var->constant_initializer != NULL);
+      var->data.read_only = true;
+      break;
+   case SpvDecorationNonWritable:
+      var->data.read_only = true;
+      break;
+   case SpvDecorationLocation:
+      var->data.location = dec->literals[0];
+      break;
+   case SpvDecorationComponent:
+      var->data.location_frac = dec->literals[0];
+      break;
+   case SpvDecorationIndex:
+      var->data.explicit_index = true;
+      var->data.index = dec->literals[0];
+      break;
+   case SpvDecorationBinding:
+      var->data.explicit_binding = true;
+      var->data.binding = dec->literals[0];
+      break;
+   case SpvDecorationDescriptorSet:
+      var->data.descriptor_set = dec->literals[0];
+      break;
+   case SpvDecorationBuiltIn: {
+      SpvBuiltIn builtin = dec->literals[0];
+
+      if (builtin == SpvBuiltInWorkgroupSize) {
+         /* This shouldn't be a builtin.  It's actually a constant. */
+         var->data.mode = nir_var_global;
+         var->data.read_only = true;
+
+         /* NOTE: this local shadows the vtn_value parameter `val`. */
+         nir_constant *val = rzalloc(var, nir_constant);
+         val->value.u[0] = b->shader->info.cs.local_size[0];
+         val->value.u[1] = b->shader->info.cs.local_size[1];
+         val->value.u[2] = b->shader->info.cs.local_size[2];
+         var->constant_initializer = val;
+         break;
+      }
+
+      /* The builtin may force a different mode (e.g. a system value);
+       * apply it and mark builtin inputs/system values read-only.
+       */
+      nir_variable_mode mode = var->data.mode;
+      vtn_get_builtin_location(b, builtin, &var->data.location, &mode);
+      var->data.explicit_location = true;
+      var->data.mode = mode;
+      if (mode == nir_var_shader_in || mode == nir_var_system_value)
+         var->data.read_only = true;
+
+      if (builtin == SpvBuiltInFragCoord || builtin == SpvBuiltInSamplePosition)
+         var->data.origin_upper_left = b->origin_upper_left;
+
+      /* Cache the variable so later builtin references reuse it. */
+      if (mode == nir_var_shader_out)
+         b->builtins[dec->literals[0]].out = var;
+      else
+         b->builtins[dec->literals[0]].in = var;
+      break;
+   }
+   case SpvDecorationRowMajor:
+   case SpvDecorationColMajor:
+   case SpvDecorationGLSLShared:
+   case SpvDecorationPatch:
+   case SpvDecorationRestrict:
+   case SpvDecorationAliased:
+   case SpvDecorationVolatile:
+   case SpvDecorationCoherent:
+   case SpvDecorationNonReadable:
+   case SpvDecorationUniform:
+      /* This is really nice but we have no use for it right now. */
+   case SpvDecorationCPacked:
+   case SpvDecorationSaturatedConversion:
+   case SpvDecorationStream:
+   case SpvDecorationOffset:
+   case SpvDecorationXfbBuffer:
+   case SpvDecorationFuncParamAttr:
+   case SpvDecorationFPRoundingMode:
+   case SpvDecorationFPFastMathMode:
+   case SpvDecorationLinkageAttributes:
+   case SpvDecorationSpecId:
+      break;
+   default:
+      unreachable("Unhandled variable decoration");
+   }
+}
+
+/* Look up, or lazily create and cache, the nir_variable backing a SPIR-V
+ * builtin for the given mode.  Note vtn_get_builtin_location may rewrite
+ * the mode (e.g. input -> system value), so the cache slot is re-decided
+ * after the variable is created.
+ */
+static nir_variable *
+get_builtin_variable(struct vtn_builder *b,
+                     nir_variable_mode mode,
+                     const struct glsl_type *type,
+                     SpvBuiltIn builtin)
+{
+   nir_variable *var = (mode == nir_var_shader_out) ? b->builtins[builtin].out
+                                                    : b->builtins[builtin].in;
+   if (var)
+      return var;
+
+   int location;
+   vtn_get_builtin_location(b, builtin, &location, &mode);
+
+   var = nir_variable_create(b->shader, mode, type, "builtin");
+   var->data.location = location;
+   var->data.explicit_location = true;
+
+   if (builtin == SpvBuiltInFragCoord || builtin == SpvBuiltInSamplePosition)
+      var->data.origin_upper_left = b->origin_upper_left;
+
+   if (mode == nir_var_shader_out)
+      b->builtins[builtin].out = var;
+   else
+      b->builtins[builtin].in = var;
+
+   return var;
+}
+
+/* Recursively load the value reached by src_deref_tail (a link inside the
+ * src_deref chain) and build a matching vtn_ssa_value tree.
+ *
+ * Vectors and scalars become a single load_var intrinsic; arrays,
+ * matrices, and structs recurse per element/field by temporarily splicing
+ * an extra deref link onto the tail.
+ */
+static struct vtn_ssa_value *
+_vtn_variable_load(struct vtn_builder *b,
+                   nir_deref_var *src_deref, nir_deref *src_deref_tail)
+{
+   struct vtn_ssa_value *val = rzalloc(b, struct vtn_ssa_value);
+   val->type = src_deref_tail->type;
+
+   /* The deref tail may contain a deref to select a component of a vector (in
+    * other words, it might not be an actual tail) so we have to save it away
+    * here since we overwrite it later.
+    */
+   nir_deref *old_child = src_deref_tail->child;
+
+   if (glsl_type_is_vector_or_scalar(val->type)) {
+      /* Terminate the deref chain in case there is one more link to pick
+       * off a component of the vector.
+       */
+      src_deref_tail->child = NULL;
+
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
+      load->variables[0] =
+         nir_deref_as_var(nir_copy_deref(load, &src_deref->deref));
+      load->num_components = glsl_get_vector_elements(val->type);
+      nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, NULL);
+
+      nir_builder_instr_insert(&b->nb, &load->instr);
+
+      if (src_deref->var->data.mode == nir_var_uniform &&
+          glsl_get_base_type(val->type) == GLSL_TYPE_BOOL) {
+         /* Uniform boolean loads need to be fixed up since they're defined
+          * to be zero/nonzero rather than NIR_FALSE/NIR_TRUE.
+          */
+         val->def = nir_ine(&b->nb, &load->dest.ssa, nir_imm_int(&b->nb, 0));
+      } else {
+         val->def = &load->dest.ssa;
+      }
+   } else if (glsl_get_base_type(val->type) == GLSL_TYPE_ARRAY ||
+              glsl_type_is_matrix(val->type)) {
+      /* Arrays and matrices: load each element through a shared direct
+       * array deref whose base_offset is stepped through the elements.
+       */
+      unsigned elems = glsl_get_length(val->type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+
+      nir_deref_array *deref = nir_deref_array_create(b);
+      deref->deref_array_type = nir_deref_array_type_direct;
+      deref->deref.type = glsl_get_array_element(val->type);
+      src_deref_tail->child = &deref->deref;
+      for (unsigned i = 0; i < elems; i++) {
+         deref->base_offset = i;
+         val->elems[i] = _vtn_variable_load(b, src_deref, &deref->deref);
+      }
+   } else {
+      assert(glsl_get_base_type(val->type) == GLSL_TYPE_STRUCT);
+      /* Structs: load each field through a struct deref. */
+      unsigned elems = glsl_get_length(val->type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+
+      nir_deref_struct *deref = nir_deref_struct_create(b, 0);
+      src_deref_tail->child = &deref->deref;
+      for (unsigned i = 0; i < elems; i++) {
+         deref->index = i;
+         deref->deref.type = glsl_get_struct_field(val->type, i);
+         val->elems[i] = _vtn_variable_load(b, src_deref, &deref->deref);
+      }
+   }
+
+   /* Restore the (possibly non-NULL) component-selecting child link. */
+   src_deref_tail->child = old_child;
+
+   return val;
+}
+
+/* Recursively store the vtn_ssa_value tree src through dest_deref_tail (a
+ * link inside the dest_deref chain).  Mirrors _vtn_variable_load:
+ * vectors/scalars emit one store_var with a full write mask; arrays,
+ * matrices, and structs recurse per element/field by temporarily
+ * extending the deref chain.
+ */
+static void
+_vtn_variable_store(struct vtn_builder *b,
+                    nir_deref_var *dest_deref, nir_deref *dest_deref_tail,
+                    struct vtn_ssa_value *src)
+{
+   /* Save the component-selecting child link (if any); it is overwritten
+    * while we recurse and restored at the end.
+    */
+   nir_deref *old_child = dest_deref_tail->child;
+
+   if (glsl_type_is_vector_or_scalar(src->type)) {
+      /* Terminate the deref chain in case there is one more link to pick
+       * off a component of the vector.
+       */
+      dest_deref_tail->child = NULL;
+
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
+      store->variables[0] =
+         nir_deref_as_var(nir_copy_deref(store, &dest_deref->deref));
+      store->num_components = glsl_get_vector_elements(src->type);
+      store->const_index[0] = (1 << store->num_components) - 1; /* write mask */
+      store->src[0] = nir_src_for_ssa(src->def);
+
+      nir_builder_instr_insert(&b->nb, &store->instr);
+   } else if (glsl_get_base_type(src->type) == GLSL_TYPE_ARRAY ||
+              glsl_type_is_matrix(src->type)) {
+      unsigned elems = glsl_get_length(src->type);
+
+      nir_deref_array *deref = nir_deref_array_create(b);
+      deref->deref_array_type = nir_deref_array_type_direct;
+      deref->deref.type = glsl_get_array_element(src->type);
+      dest_deref_tail->child = &deref->deref;
+      for (unsigned i = 0; i < elems; i++) {
+         deref->base_offset = i;
+         _vtn_variable_store(b, dest_deref, &deref->deref, src->elems[i]);
+      }
+   } else {
+      assert(glsl_get_base_type(src->type) == GLSL_TYPE_STRUCT);
+      unsigned elems = glsl_get_length(src->type);
+
+      nir_deref_struct *deref = nir_deref_struct_create(b, 0);
+      dest_deref_tail->child = &deref->deref;
+      for (unsigned i = 0; i < elems; i++) {
+         deref->index = i;
+         deref->deref.type = glsl_get_struct_field(src->type, i);
+         _vtn_variable_store(b, dest_deref, &deref->deref, src->elems[i]);
+      }
+   }
+
+   dest_deref_tail->child = old_child;
+}
+
+/* Build an SSA value for an array deref's element index: the constant
+ * base_offset, plus the indirect index when the deref has one.
+ */
+static nir_ssa_def *
+deref_array_offset(struct vtn_builder *b, nir_deref *deref)
+{
+   assert(deref->deref_type == nir_deref_type_array);
+   nir_deref_array *arr = nir_deref_as_array(deref);
+   nir_ssa_def *off = nir_imm_int(&b->nb, arr->base_offset);
+
+   return (arr->deref_array_type == nir_deref_array_type_indirect)
+          ? nir_iadd(&b->nb, off, arr->indirect.ssa)
+          : off;
+}
+
+/* Emit a vulkan_resource_index intrinsic for the block variable at the
+ * head of *deref.
+ *
+ * If the variable is an array of blocks, the first array link of the deref
+ * chain supplies the array index and *deref / *type are advanced past it;
+ * otherwise the index is 0.  The descriptor set, binding, and variable
+ * mode ride along as const_index[0..2].  Returns the intrinsic's SSA
+ * result, used as the block index source for UBO/SSBO intrinsics.
+ */
+static nir_ssa_def *
+get_vulkan_resource_index(struct vtn_builder *b,
+                          nir_deref **deref, struct vtn_type **type)
+{
+   assert((*deref)->deref_type == nir_deref_type_var);
+   nir_variable *var = nir_deref_as_var(*deref)->var;
+
+   assert(var->interface_type && "variable is a block");
+
+   nir_ssa_def *array_index;
+   if ((*deref)->child && (*deref)->child->deref_type == nir_deref_type_array) {
+      /* Arrayed block: consume the first array deref as the index. */
+      *deref = (*deref)->child;
+      *type = (*type)->array_element;
+      array_index = deref_array_offset(b, *deref);
+   } else {
+      array_index = nir_imm_int(&b->nb, 0);
+   }
+
+   nir_intrinsic_instr *instr =
+      nir_intrinsic_instr_create(b->nb.shader,
+                                 nir_intrinsic_vulkan_resource_index);
+   instr->src[0] = nir_src_for_ssa(array_index);
+   instr->const_index[0] = var->data.descriptor_set;
+   instr->const_index[1] = var->data.binding;
+   instr->const_index[2] = var->data.mode;
+
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+   nir_builder_instr_insert(&b->nb, &instr->instr);
+
+   return &instr->dest.ssa;
+}
+
+/* Emit a single block load or store intrinsic for one vector or scalar.
+ *
+ * op is the intrinsic opcode; index is the optional block index source
+ * (NULL e.g. for push constants) and offset the byte offset source.  For
+ * loads, (*inout)->def receives the result, and loaded booleans are
+ * canonicalized with `i != 0` since block memory only guarantees
+ * zero/nonzero.  For stores, (*inout)->def supplies the data and a full
+ * write mask is used.
+ */
+static void
+_vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load,
+                     nir_ssa_def *index, nir_ssa_def *offset,
+                     struct vtn_ssa_value **inout, const struct glsl_type *type)
+{
+   nir_intrinsic_instr *instr = nir_intrinsic_instr_create(b->nb.shader, op);
+   instr->num_components = glsl_get_vector_elements(type);
+
+   int src = 0;
+   if (!load) {
+      instr->const_index[0] = (1 << instr->num_components) - 1; /* write mask */
+      instr->src[src++] = nir_src_for_ssa((*inout)->def);
+   }
+
+   if (index)
+      instr->src[src++] = nir_src_for_ssa(index);
+
+   instr->src[src++] = nir_src_for_ssa(offset);
+
+   if (load) {
+      nir_ssa_dest_init(&instr->instr, &instr->dest,
+                        instr->num_components, NULL);
+      (*inout)->def = &instr->dest.ssa;
+   }
+
+   nir_builder_instr_insert(&b->nb, &instr->instr);
+
+   if (load && glsl_get_base_type(type) == GLSL_TYPE_BOOL)
+      (*inout)->def = nir_ine(&b->nb, (*inout)->def, nir_imm_int(&b->nb, 0));
+}
+
+/* Recursive worker for external block (UBO/SSBO/push constant) loads and
+ * stores.
+ *
+ * Walks the vtn_type tree in step with the remaining deref chain,
+ * accumulating a byte offset from the explicit strides and member offsets
+ * SPIR-V gave us.  When a vector or scalar is reached, a single intrinsic
+ * is emitted through _vtn_load_store_tail.  Row-major matrices get special
+ * handling: whole-matrix accesses operate on the transpose one row at a
+ * time, and single-column accesses gather/scatter one component at a time.
+ *
+ * For loads (load == true), *inout is allocated as needed and filled in;
+ * for stores it supplies the data.  index is the Vulkan resource index
+ * source (NULL for push constants).
+ */
+static void
+_vtn_block_load_store(struct vtn_builder *b, nir_intrinsic_op op, bool load,
+                      nir_ssa_def *index, nir_ssa_def *offset, nir_deref *deref,
+                      struct vtn_type *type, struct vtn_ssa_value **inout)
+{
+   if (load && deref == NULL && *inout == NULL)
+      *inout = vtn_create_ssa_value(b, type->type);
+
+   enum glsl_base_type base_type = glsl_get_base_type(type->type);
+   switch (base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      /* This is where things get interesting.  At this point, we've hit
+       * a vector, a scalar, or a matrix.
+       */
+      if (glsl_type_is_matrix(type->type)) {
+         if (deref == NULL) {
+            /* Loading/storing the whole matrix. */
+            struct vtn_ssa_value *transpose;
+            unsigned num_ops, vec_width;
+            if (type->row_major) {
+               /* Row-major: each intrinsic moves one row, so the value in
+                * flight is the transpose of the matrix.
+                */
+               num_ops = glsl_get_vector_elements(type->type);
+               vec_width = glsl_get_matrix_columns(type->type);
+               if (load) {
+                  const struct glsl_type *transpose_type =
+                     glsl_matrix_type(base_type, vec_width, num_ops);
+                  *inout = vtn_create_ssa_value(b, transpose_type);
+               } else {
+                  transpose = vtn_ssa_transpose(b, *inout);
+                  inout = &transpose;
+               }
+            } else {
+               num_ops = glsl_get_matrix_columns(type->type);
+               vec_width = glsl_get_vector_elements(type->type);
+            }
+
+            for (unsigned i = 0; i < num_ops; i++) {
+               nir_ssa_def *elem_offset =
+                  nir_iadd(&b->nb, offset,
+                           nir_imm_int(&b->nb, i * type->stride));
+               _vtn_load_store_tail(b, op, load, index, elem_offset,
+                                    &(*inout)->elems[i],
+                                    glsl_vector_type(base_type, vec_width));
+            }
+
+            /* Undo the transpose we loaded into. */
+            if (load && type->row_major)
+               *inout = vtn_ssa_transpose(b, *inout);
+
+            return;
+         } else if (type->row_major) {
+            /* Row-major but with a deref. */
+            nir_ssa_def *col_offset =
+               nir_imul(&b->nb, deref_array_offset(b, deref),
+                        nir_imm_int(&b->nb, type->array_element->stride));
+            offset = nir_iadd(&b->nb, offset, col_offset);
+
+            if (deref->child) {
+               /* Picking off a single element */
+               nir_ssa_def *row_offset =
+                  nir_imul(&b->nb, deref_array_offset(b, deref->child),
+                           nir_imm_int(&b->nb, type->stride));
+               offset = nir_iadd(&b->nb, offset, row_offset);
+               _vtn_load_store_tail(b, op, load, index, offset, inout,
+                                    glsl_scalar_type(base_type));
+               return;
+            } else {
+               /* A column of a row-major matrix is strided in memory, so
+                * move it one component at a time.
+                */
+               unsigned num_comps = glsl_get_vector_elements(type->type);
+               nir_ssa_def *comps[4];
+               for (unsigned i = 0; i < num_comps; i++) {
+                  nir_ssa_def *elem_offset =
+                     nir_iadd(&b->nb, offset,
+                              nir_imm_int(&b->nb, i * type->stride));
+
+                  struct vtn_ssa_value *comp, temp_val;
+                  temp_val.type = glsl_scalar_type(base_type);
+                  if (!load)
+                     temp_val.def = nir_channel(&b->nb, (*inout)->def, i);
+                  /* comp must point at valid storage for loads as well:
+                   * _vtn_load_store_tail writes the loaded component
+                   * through *comp and we read comp->def below, so leaving
+                   * it NULL crashed on row-major column loads.
+                   */
+                  comp = &temp_val;
+                  _vtn_load_store_tail(b, op, load, index, elem_offset,
+                                       &comp, glsl_scalar_type(base_type));
+                  comps[i] = comp->def;
+               }
+
+               if (load)
+                  (*inout)->def = nir_vec(&b->nb, comps, num_comps);
+               return;
+            }
+         } else {
+            /* Column-major with a deref. Fall through to array case. */
+         }
+      } else if (deref == NULL) {
+         assert(glsl_type_is_vector_or_scalar(type->type));
+         _vtn_load_store_tail(b, op, load, index, offset, inout, type->type);
+         return;
+      } else {
+         /* Single component of a vector. Fall through to array case. */
+      }
+      /* Fall through */
+
+   case GLSL_TYPE_ARRAY:
+      if (deref) {
+         /* Step into the indexed element and recurse. */
+         offset = nir_iadd(&b->nb, offset,
+                           nir_imul(&b->nb, deref_array_offset(b, deref),
+                                    nir_imm_int(&b->nb, type->stride)));
+
+         _vtn_block_load_store(b, op, load, index, offset, deref->child,
+                               type->array_element, inout);
+         return;
+      } else {
+         /* No deref left: move every element. */
+         unsigned elems = glsl_get_length(type->type);
+         for (unsigned i = 0; i < elems; i++) {
+            nir_ssa_def *elem_off =
+               nir_iadd(&b->nb, offset, nir_imm_int(&b->nb, i * type->stride));
+            _vtn_block_load_store(b, op, load, index, elem_off, NULL,
+                                  type->array_element, &(*inout)->elems[i]);
+         }
+         return;
+      }
+      unreachable("Both branches above return");
+
+   case GLSL_TYPE_STRUCT:
+      if (deref) {
+         /* Step into the selected member and recurse. */
+         unsigned member = nir_deref_as_struct(deref)->index;
+         offset = nir_iadd(&b->nb, offset,
+                           nir_imm_int(&b->nb, type->offsets[member]));
+
+         _vtn_block_load_store(b, op, load, index, offset, deref->child,
+                               type->members[member], inout);
+         return;
+      } else {
+         unsigned elems = glsl_get_length(type->type);
+         for (unsigned i = 0; i < elems; i++) {
+            nir_ssa_def *elem_off =
+               nir_iadd(&b->nb, offset, nir_imm_int(&b->nb, type->offsets[i]));
+            _vtn_block_load_store(b, op, load, index, elem_off, NULL,
+                                  type->members[i], &(*inout)->elems[i]);
+         }
+         return;
+      }
+      unreachable("Both branches above return");
+
+   default:
+      unreachable("Invalid block member type");
+   }
+}
+
+/* Load from a variable that lives in an external block.
+ *
+ * The intrinsic opcode is chosen from the variable's mode and descriptor
+ * info: uniform with a valid descriptor set is a UBO load, uniform with
+ * set/binding == -1 is a push-constant load, and shader_storage is an
+ * SSBO load.  UBO/SSBO loads additionally need a resource index.
+ */
+static struct vtn_ssa_value *
+vtn_block_load(struct vtn_builder *b, nir_deref_var *src,
+               struct vtn_type *type)
+{
+   nir_intrinsic_op op;
+   if (src->var->data.mode == nir_var_uniform) {
+      if (src->var->data.descriptor_set >= 0) {
+         /* UBO load */
+         assert(src->var->data.binding >= 0);
+
+         op = nir_intrinsic_load_ubo;
+      } else {
+         /* Push constant load */
+         assert(src->var->data.descriptor_set == -1 &&
+                src->var->data.binding == -1);
+
+         op = nir_intrinsic_load_push_constant;
+      }
+   } else {
+      assert(src->var->data.mode == nir_var_shader_storage);
+      op = nir_intrinsic_load_ssbo;
+   }
+
+   nir_deref *block_deref = &src->deref;
+   nir_ssa_def *index = NULL; /* push constants are addressed by offset only */
+   if (op == nir_intrinsic_load_ubo || op == nir_intrinsic_load_ssbo)
+      index = get_vulkan_resource_index(b, &block_deref, &type);
+
+   struct vtn_ssa_value *value = NULL;
+   _vtn_block_load_store(b, op, true, index, nir_imm_int(&b->nb, 0),
+                         block_deref->child, type, &value);
+   return value;
+}
+
+/*
+ * Walks a NIR deref chain down to the last link before any per-component
+ * selection: it stops at the first link whose type is a plain vector or
+ * scalar, or at the end of the chain.  A remaining child link (allowed by
+ * OpAccessChain's per-component indexing) is left for the caller.
+ */
+
+static nir_deref *
+get_deref_tail(nir_deref_var *deref)
+{
+   nir_deref *tail = &deref->deref;
+   while (tail->child && !glsl_type_is_vector_or_scalar(tail->type))
+      tail = tail->child;
+
+   return tail;
+}
+
+static nir_ssa_def *vtn_vector_extract(struct vtn_builder *b,
+                                       nir_ssa_def *src, unsigned index);
+
+static nir_ssa_def *vtn_vector_extract_dynamic(struct vtn_builder *b,
+                                               nir_ssa_def *src,
+                                               nir_ssa_def *index);
+
+/* A variable is an "external block" (UBO or SSBO) when it has a struct
+ * interface type and uniform or shader-storage mode.
+ */
+static bool
+variable_is_external_block(nir_variable *var)
+{
+   if (!var->interface_type || !glsl_type_is_struct(var->interface_type))
+      return false;
+
+   return var->data.mode == nir_var_uniform ||
+          var->data.mode == nir_var_shader_storage;
+}
+
+/* Load a SPIR-V variable (or the part its deref chain selects).
+ *
+ * External blocks (UBO/SSBO/push constant) go through vtn_block_load;
+ * everything else is loaded with load_var intrinsics.  If the deref chain
+ * has one extra link past the vector-typed tail (OpAccessChain component
+ * indexing), the requested component is extracted from the loaded vector.
+ */
+struct vtn_ssa_value *
+vtn_variable_load(struct vtn_builder *b, nir_deref_var *src,
+                  struct vtn_type *src_type)
+{
+   if (variable_is_external_block(src->var))
+      return vtn_block_load(b, src, src_type);
+
+   nir_deref *src_tail = get_deref_tail(src);
+   struct vtn_ssa_value *val = _vtn_variable_load(b, src, src_tail);
+
+   if (src_tail->child) {
+      /* Component selection: narrow the loaded vector to one element. */
+      nir_deref_array *vec_deref = nir_deref_as_array(src_tail->child);
+      assert(vec_deref->deref.child == NULL);
+      val->type = vec_deref->deref.type;
+      if (vec_deref->deref_array_type == nir_deref_array_type_direct)
+         val->def = vtn_vector_extract(b, val->def, vec_deref->base_offset);
+      else
+         val->def = vtn_vector_extract_dynamic(b, val->def,
+                                               vec_deref->indirect.ssa);
+   }
+
+   return val;
+}
+
+/* Store src into an external block variable.  Only SSBOs are writable, so
+ * this always emits store_ssbo intrinsics rooted at byte offset 0.
+ */
+static void
+vtn_block_store(struct vtn_builder *b, struct vtn_ssa_value *src,
+                nir_deref_var *dest, struct vtn_type *type)
+{
+   nir_deref *deref = &dest->deref;
+   nir_ssa_def *index = get_vulkan_resource_index(b, &deref, &type);
+   nir_ssa_def *zero = nir_imm_int(&b->nb, 0);
+
+   _vtn_block_load_store(b, nir_intrinsic_store_ssbo, false, index, zero,
+                         deref->child, type, &src);
+}
+
+static nir_ssa_def * vtn_vector_insert(struct vtn_builder *b,
+                                       nir_ssa_def *src, nir_ssa_def *insert,
+                                       unsigned index);
+
+static nir_ssa_def * vtn_vector_insert_dynamic(struct vtn_builder *b,
+                                               nir_ssa_def *src,
+                                               nir_ssa_def *insert,
+                                               nir_ssa_def *index);
+/* Store src into a SPIR-V variable (or the part its deref chain selects).
+ *
+ * External blocks go through vtn_block_store and must be SSBOs (the only
+ * writable block storage here).  A store to a single component of a vector
+ * is done as a read-modify-write: load the vector, insert the component,
+ * and write the whole vector back.
+ */
+void
+vtn_variable_store(struct vtn_builder *b, struct vtn_ssa_value *src,
+                   nir_deref_var *dest, struct vtn_type *dest_type)
+{
+   if (variable_is_external_block(dest->var)) {
+      assert(dest->var->data.mode == nir_var_shader_storage);
+      vtn_block_store(b, src, dest, dest_type);
+   } else {
+      nir_deref *dest_tail = get_deref_tail(dest);
+      if (dest_tail->child) {
+         /* Read-modify-write for per-component stores. */
+         struct vtn_ssa_value *val = _vtn_variable_load(b, dest, dest_tail);
+         nir_deref_array *deref = nir_deref_as_array(dest_tail->child);
+         assert(deref->deref.child == NULL);
+         if (deref->deref_array_type == nir_deref_array_type_direct)
+            val->def = vtn_vector_insert(b, val->def, src->def,
+                                         deref->base_offset);
+         else
+            val->def = vtn_vector_insert_dynamic(b, val->def, src->def,
+                                                 deref->indirect.ssa);
+         _vtn_variable_store(b, dest, dest_tail, val);
+      } else {
+         _vtn_variable_store(b, dest, dest_tail, src);
+      }
+   }
+}
+
+/* Copy one SPIR-V variable to another.  Plain variables use a single
+ * copy_var intrinsic; if either side is an interface block, the copy is
+ * lowered to an explicit load/store pair instead.
+ */
+static void
+vtn_variable_copy(struct vtn_builder *b,
+                  nir_deref_var *dest, struct vtn_type *dest_type,
+                  nir_deref_var *src, struct vtn_type *src_type)
+{
+   if (!src->var->interface_type && !dest->var->interface_type) {
+      nir_intrinsic_instr *copy =
+         nir_intrinsic_instr_create(b->shader, nir_intrinsic_copy_var);
+      copy->variables[0] = nir_deref_as_var(nir_copy_deref(copy, &dest->deref));
+      copy->variables[1] = nir_deref_as_var(nir_copy_deref(copy, &src->deref));
+
+      nir_builder_instr_insert(&b->nb, &copy->instr);
+   } else {
+      struct vtn_ssa_value *val = vtn_variable_load(b, src, src_type);
+      vtn_variable_store(b, val, dest, dest_type);
+   }
+}
+
+/* Tries to compute the size of an interface block based on the strides and
+ * offsets that are provided to us in the SPIR-V source.
+ */
+static unsigned
+vtn_type_block_size(struct vtn_type *type)
+{
+   enum glsl_base_type base_type = glsl_get_base_type(type->type);
+   switch (base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_DOUBLE: {
+      /* Matrices take stride * column-count (or row-count when
+       * row-major); scalars and vectors are tightly packed at the
+       * component size.
+       */
+      unsigned cols;
+      if (type->row_major)
+         cols = glsl_get_vector_elements(type->type);
+      else
+         cols = glsl_get_matrix_columns(type->type);
+
+      if (cols > 1) {
+         assert(type->stride > 0);
+         return type->stride * cols;
+      }
+
+      unsigned comp_size = (base_type == GLSL_TYPE_DOUBLE) ? 8 : 4;
+      return glsl_get_vector_elements(type->type) * comp_size;
+   }
+
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_INTERFACE: {
+      /* The block ends wherever its furthest-reaching member ends. */
+      unsigned size = 0;
+      unsigned num_fields = glsl_get_length(type->type);
+      for (unsigned f = 0; f < num_fields; f++) {
+         unsigned field_end = type->offsets[f] +
+                              vtn_type_block_size(type->members[f]);
+         if (field_end > size)
+            size = field_end;
+      }
+      return size;
+   }
+
+   case GLSL_TYPE_ARRAY:
+      assert(type->stride > 0);
+      assert(glsl_get_length(type->type) > 0);
+      return type->stride * glsl_get_length(type->type);
+
+   default:
+      assert(!"Invalid block type");
+      return 0;
+   }
+}
+
+/* Returns true for types that name a resource interface: uniform/storage
+ * blocks, samplers, and images.
+ */
+static bool
+is_interface_type(struct vtn_type *type)
+{
+   if (type->block || type->buffer_block)
+      return true;
+
+   return glsl_type_is_sampler(type->type) ||
+          glsl_type_is_image(type->type);
+}
+
+/* Handles the variable-related SPIR-V opcodes: OpVariable, the access
+ * chain opcodes, OpCopyMemory, OpLoad, OpStore, and OpArrayLength.
+ */
+static void
+vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
+                     const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpVariable: {
+      struct vtn_type *type =
+         vtn_value(b, w[1], vtn_value_type_type)->type;
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_deref);
+      SpvStorageClass storage_class = w[3];
+
+      nir_variable *var = rzalloc(b->shader, nir_variable);
+
+      var->type = type->type;
+      var->name = ralloc_strdup(var, val->name);
+
+      /* The interface type may be the variable's own type or, for an
+       * arrayed resource, the element type of that array.
+       */
+      struct vtn_type *interface_type;
+      if (is_interface_type(type)) {
+         interface_type = type;
+      } else if (glsl_type_is_array(type->type) &&
+                 is_interface_type(type->array_element)) {
+         interface_type = type->array_element;
+      } else {
+         interface_type = NULL;
+      }
+
+      if (interface_type)
+         var->interface_type = interface_type->type;
+
+      switch (storage_class) {
+      case SpvStorageClassUniform:
+      case SpvStorageClassUniformConstant:
+         if (interface_type && interface_type->buffer_block) {
+            var->data.mode = nir_var_shader_storage;
+            b->shader->info.num_ssbos++;
+         } else {
+            /* UBO's and samplers */
+            var->data.mode = nir_var_uniform;
+            var->data.read_only = true;
+            if (interface_type) {
+               if (glsl_type_is_image(interface_type->type)) {
+                  b->shader->info.num_images++;
+                  var->data.image.format = interface_type->image_format;
+
+                  switch (interface_type->access_qualifier) {
+                  case SpvAccessQualifierReadOnly:
+                     var->data.image.read_only = true;
+                     break;
+                  case SpvAccessQualifierWriteOnly:
+                     var->data.image.write_only = true;
+                     break;
+                  default:
+                     break;
+                  }
+               } else if (glsl_type_is_sampler(interface_type->type)) {
+                  b->shader->info.num_textures++;
+               } else {
+                  assert(glsl_type_is_struct(interface_type->type));
+                  b->shader->info.num_ubos++;
+               }
+            }
+         }
+         break;
+      case SpvStorageClassPushConstant:
+         assert(interface_type && interface_type->block);
+         var->data.mode = nir_var_uniform;
+         var->data.read_only = true;
+         /* Push constants have no descriptor set or binding. */
+         var->data.descriptor_set = -1;
+         var->data.binding = -1;
+
+         /* We have exactly one push constant block */
+         assert(b->shader->num_uniforms == 0);
+         b->shader->num_uniforms = vtn_type_block_size(type) * 4;
+         break;
+      case SpvStorageClassInput:
+         var->data.mode = nir_var_shader_in;
+         var->data.read_only = true;
+         break;
+      case SpvStorageClassOutput:
+         var->data.mode = nir_var_shader_out;
+         break;
+      case SpvStorageClassPrivate:
+         var->data.mode = nir_var_global;
+         var->interface_type = NULL;
+         break;
+      case SpvStorageClassFunction:
+         var->data.mode = nir_var_local;
+         var->interface_type = NULL;
+         break;
+      case SpvStorageClassWorkgroup:
+         var->data.mode = nir_var_shared;
+         break;
+      case SpvStorageClassCrossWorkgroup:
+      case SpvStorageClassGeneric:
+      case SpvStorageClassAtomicCounter:
+      default:
+         unreachable("Unhandled variable storage class");
+      }
+
+      /* An optional fifth word names a constant initializer. */
+      if (count > 4) {
+         assert(count == 5);
+         nir_constant *constant =
+            vtn_value(b, w[4], vtn_value_type_constant)->constant;
+         var->constant_initializer = nir_constant_clone(constant, var);
+      }
+
+      val->deref = nir_deref_var_create(b, var);
+      val->deref_type = type;
+
+      /* We handle decorations first because decorations might give us
+       * location information.  We use the data.explicit_location field to
+       * note that the location provided is the "final" location.  If
+       * data.explicit_location == false, this means that it's relative to
+       * whatever the base location is.
+       */
+      vtn_foreach_decoration(b, val, var_decoration_cb, var);
+
+      if (!var->data.explicit_location) {
+         if (b->shader->stage == MESA_SHADER_FRAGMENT &&
+             var->data.mode == nir_var_shader_out) {
+            var->data.location += FRAG_RESULT_DATA0;
+         } else if (b->shader->stage == MESA_SHADER_VERTEX &&
+                    var->data.mode == nir_var_shader_in) {
+            var->data.location += VERT_ATTRIB_GENERIC0;
+         } else if (var->data.mode == nir_var_shader_in ||
+                    var->data.mode == nir_var_shader_out) {
+            var->data.location += VARYING_SLOT_VAR0;
+         }
+      }
+
+      /* XXX: Work around what appears to be a glslang bug.  While the
+       * SPIR-V spec doesn't say that setting a descriptor set on a push
+       * constant is invalid, it certainly makes no sense.  However, at
+       * some point, glslang started setting descriptor set 0 on push
+       * constants for some unknown reason.  Hopefully this can be removed
+       * at some point in the future.
+       */
+      if (storage_class == SpvStorageClassPushConstant) {
+         var->data.descriptor_set = -1;
+         var->data.binding = -1;
+      }
+
+      /* Interface block variables aren't actually going to be referenced
+       * by the generated NIR, so we don't put them in the list
+       */
+      if (var->interface_type && glsl_type_is_struct(var->interface_type))
+         break;
+
+      if (var->data.mode == nir_var_local) {
+         nir_function_impl_add_variable(b->impl, var);
+      } else {
+         nir_shader_add_variable(b->shader, var);
+      }
+
+      break;
+   }
+
+   case SpvOpAccessChain:
+   case SpvOpInBoundsAccessChain: {
+      nir_deref_var *base;
+      struct vtn_value *base_val = vtn_untyped_value(b, w[3]);
+      if (base_val->value_type == vtn_value_type_sampled_image) {
+         /* This is rather insane.  SPIR-V allows you to use OpSampledImage
+          * to combine an array of images with a single sampler to get an
+          * array of sampled images that all share the same sampler.
+          * Fortunately, this means that we can more-or-less ignore the
+          * sampler when crawling the access chain, but it does leave us
+          * with this rather awkward little special-case.
+          */
+         base = base_val->sampled_image->image;
+      } else {
+         assert(base_val->value_type == vtn_value_type_deref);
+         base = base_val->deref;
+      }
+
+      nir_deref_var *deref = nir_deref_as_var(nir_copy_deref(b, &base->deref));
+      struct vtn_type *deref_type = vtn_value(b, w[3], vtn_value_type_deref)->deref_type;
+
+      /* Walk to the end of the existing chain so new links append there. */
+      nir_deref *tail = &deref->deref;
+      while (tail->child)
+         tail = tail->child;
+
+      /* Each remaining operand adds one link (array or struct deref). */
+      for (unsigned i = 0; i < count - 4; i++) {
+         assert(w[i + 4] < b->value_id_bound);
+         struct vtn_value *idx_val = &b->values[w[i + 4]];
+
+         enum glsl_base_type base_type = glsl_get_base_type(tail->type);
+         switch (base_type) {
+         case GLSL_TYPE_UINT:
+         case GLSL_TYPE_INT:
+         case GLSL_TYPE_FLOAT:
+         case GLSL_TYPE_DOUBLE:
+         case GLSL_TYPE_BOOL:
+         case GLSL_TYPE_ARRAY: {
+            nir_deref_array *deref_arr = nir_deref_array_create(b);
+            if (base_type == GLSL_TYPE_ARRAY ||
+                glsl_type_is_matrix(tail->type)) {
+               deref_type = deref_type->array_element;
+            } else {
+               /* Indexing into a vector yields its scalar base type. */
+               assert(glsl_type_is_vector(tail->type));
+               deref_type = ralloc(b, struct vtn_type);
+               deref_type->type = glsl_scalar_type(base_type);
+            }
+
+            deref_arr->deref.type = deref_type->type;
+
+            if (idx_val->value_type == vtn_value_type_constant) {
+               unsigned idx = idx_val->constant->value.u[0];
+               deref_arr->deref_array_type = nir_deref_array_type_direct;
+               deref_arr->base_offset = idx;
+            } else {
+               assert(idx_val->value_type == vtn_value_type_ssa);
+               assert(glsl_type_is_scalar(idx_val->ssa->type));
+               deref_arr->deref_array_type = nir_deref_array_type_indirect;
+               deref_arr->base_offset = 0;
+               deref_arr->indirect = nir_src_for_ssa(idx_val->ssa->def);
+            }
+            tail->child = &deref_arr->deref;
+            break;
+         }
+
+         case GLSL_TYPE_STRUCT: {
+            /* Struct members must be selected with constant indices. */
+            assert(idx_val->value_type == vtn_value_type_constant);
+            unsigned idx = idx_val->constant->value.u[0];
+            deref_type = deref_type->members[idx];
+            nir_deref_struct *deref_struct = nir_deref_struct_create(b, idx);
+            deref_struct->deref.type = deref_type->type;
+            tail->child = &deref_struct->deref;
+            break;
+         }
+         default:
+            unreachable("Invalid type for deref");
+         }
+
+         if (deref_type->is_builtin) {
+            /* If we encounter a builtin, we throw away the rest of the
+             * access chain, jump to the builtin, and keep building.
+             */
+            const struct glsl_type *builtin_type = deref_type->type;
+
+            nir_deref_array *per_vertex_deref = NULL;
+            if (glsl_type_is_array(base->var->type)) {
+               /* This builtin is a per-vertex builtin */
+               assert(b->shader->stage == MESA_SHADER_GEOMETRY);
+               assert(base->var->data.mode == nir_var_shader_in);
+               builtin_type = glsl_array_type(builtin_type,
+                                              b->shader->info.gs.vertices_in);
+
+               /* The first non-var deref should be an array deref. */
+               assert(deref->deref.child->deref_type ==
+                      nir_deref_type_array);
+               per_vertex_deref = nir_deref_as_array(deref->deref.child);
+            }
+
+            nir_variable *builtin = get_builtin_variable(b,
+                                                         base->var->data.mode,
+                                                         builtin_type,
+                                                         deref_type->builtin);
+            deref = nir_deref_var_create(b, builtin);
+
+            if (per_vertex_deref) {
+               /* Since deref chains start at the variable, we can just
+                * steal that link and use it.
+                */
+               deref->deref.child = &per_vertex_deref->deref;
+               per_vertex_deref->deref.child = NULL;
+               per_vertex_deref->deref.type =
+                  glsl_get_array_element(builtin_type);
+
+               tail = &per_vertex_deref->deref;
+            } else {
+               tail = &deref->deref;
+            }
+         } else {
+            tail = tail->child;
+         }
+      }
+
+      /* For uniform blocks, we don't resolve the access chain until we
+       * actually access the variable, so we need to keep around the original
+       * type of the variable.
+       */
+      if (variable_is_external_block(base->var))
+         deref_type = vtn_value(b, w[3], vtn_value_type_deref)->deref_type;
+
+      if (base_val->value_type == vtn_value_type_sampled_image) {
+         struct vtn_value *val =
+            vtn_push_value(b, w[2], vtn_value_type_sampled_image);
+         val->sampled_image = ralloc(b, struct vtn_sampled_image);
+         val->sampled_image->image = deref;
+         val->sampled_image->sampler = base_val->sampled_image->sampler;
+      } else {
+         struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_deref);
+         val->deref = deref;
+         val->deref_type = deref_type;
+      }
+
+      break;
+   }
+
+   case SpvOpCopyMemory: {
+      struct vtn_value *dest = vtn_value(b, w[1], vtn_value_type_deref);
+      struct vtn_value *src = vtn_value(b, w[2], vtn_value_type_deref);
+
+      vtn_variable_copy(b, dest->deref, dest->deref_type,
+                        src->deref, src->deref_type);
+      break;
+   }
+
+   case SpvOpLoad: {
+      nir_deref_var *src = vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      struct vtn_type *src_type =
+         vtn_value(b, w[3], vtn_value_type_deref)->deref_type;
+
+      /* Loading a sampler or image just forwards the deref; there is no
+       * SSA value to produce for those.
+       */
+      if (src->var->interface_type &&
+          (glsl_type_is_sampler(src->var->interface_type) ||
+           glsl_type_is_image(src->var->interface_type))) {
+         vtn_push_value(b, w[2], vtn_value_type_deref)->deref = src;
+         return;
+      }
+
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      val->ssa = vtn_variable_load(b, src, src_type);
+      break;
+   }
+
+   case SpvOpStore: {
+      nir_deref_var *dest = vtn_value(b, w[1], vtn_value_type_deref)->deref;
+      struct vtn_type *dest_type =
+         vtn_value(b, w[1], vtn_value_type_deref)->deref_type;
+      struct vtn_ssa_value *src = vtn_ssa_value(b, w[2]);
+      vtn_variable_store(b, src, dest, dest_type);
+      break;
+   }
+
+   case SpvOpArrayLength: {
+      struct vtn_value *v_deref = vtn_value(b, w[3], vtn_value_type_deref);
+      struct vtn_type *type = v_deref->deref_type;
+      /* w[4] selects the (runtime-sized array) member being measured. */
+      const uint32_t offset = type->offsets[w[4]];
+      const uint32_t stride = type->members[w[4]]->stride;
+      nir_deref *n_deref = &v_deref->deref->deref;
+      nir_ssa_def *index =
+         get_vulkan_resource_index(b, &n_deref, &type);
+      nir_intrinsic_instr *instr =
+         nir_intrinsic_instr_create(b->nb.shader,
+                                    nir_intrinsic_get_buffer_size);
+      instr->src[0] = nir_src_for_ssa(index);
+      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+      nir_builder_instr_insert(&b->nb, &instr->instr);
+      nir_ssa_def *buf_size = &instr->dest.ssa;
+
+      /* array_length = max(buffer_size - offset, 0) / stride */
+      nir_ssa_def *array_length =
+         nir_idiv(&b->nb,
+                  nir_imax(&b->nb,
+                           nir_isub(&b->nb,
+                                    buf_size,
+                                    nir_imm_int(&b->nb, offset)),
+                           nir_imm_int(&b->nb, 0u)),
+                  nir_imm_int(&b->nb, stride));
+
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      val->ssa = vtn_create_ssa_value(b, glsl_uint_type());
+      val->ssa->def = array_length;
+      break;
+   }
+
+   case SpvOpCopyMemorySized:
+   default:
+      unreachable("Unhandled opcode");
+   }
+}
+
+/* Handles OpFunctionCall: emits a nir_call_instr.  SSA arguments are
+ * marshalled through temporary local variables (NIR call parameters are
+ * derefs), and a non-void return value is read back out of an "out_tmp"
+ * temporary after the call.
+ */
+static void
+vtn_handle_function_call(struct vtn_builder *b, SpvOp opcode,
+                         const uint32_t *w, unsigned count)
+{
+   struct nir_function *callee =
+      vtn_value(b, w[3], vtn_value_type_function)->func->impl->function;
+
+   nir_call_instr *call = nir_call_instr_create(b->nb.shader, callee);
+   for (unsigned i = 0; i < call->num_params; i++) {
+      unsigned arg_id = w[4 + i];
+      struct vtn_value *arg = vtn_untyped_value(b, arg_id);
+      if (arg->value_type == vtn_value_type_deref) {
+         /* Pointer argument: pass a copy of the deref chain directly. */
+         call->params[i] =
+            nir_deref_as_var(nir_copy_deref(call, &arg->deref->deref));
+      } else {
+         struct vtn_ssa_value *arg_ssa = vtn_ssa_value(b, arg_id);
+
+         /* Make a temporary to store the argument in */
+         nir_variable *tmp =
+            nir_local_variable_create(b->impl, arg_ssa->type, "arg_tmp");
+         call->params[i] = nir_deref_var_create(call, tmp);
+
+         vtn_variable_store(b, arg_ssa, call->params[i], arg->type);
+      }
+   }
+
+   nir_variable *out_tmp = NULL;
+   if (!glsl_type_is_void(callee->return_type)) {
+      out_tmp = nir_local_variable_create(b->impl, callee->return_type,
+                                          "out_tmp");
+      call->return_deref = nir_deref_var_create(call, out_tmp);
+   }
+
+   nir_builder_instr_insert(&b->nb, &call->instr);
+
+   if (glsl_type_is_void(callee->return_type)) {
+      /* Void calls still push a value so the result ID resolves. */
+      vtn_push_value(b, w[2], vtn_value_type_undef);
+   } else {
+      struct vtn_type *rettype = vtn_value(b, w[1], vtn_value_type_type)->type;
+      struct vtn_value *retval = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      retval->ssa = vtn_variable_load(b, call->return_deref, rettype);
+   }
+}
+
+/* Recursively allocates an empty vtn_ssa_value tree matching "type".
+ *
+ * Vectors and scalars are leaves; matrices, arrays, and structs get one
+ * child value per column/element/field.  The "def" fields are left NULL
+ * (rzalloc zeroes the allocation) for the caller to fill in.
+ */
+struct vtn_ssa_value *
+vtn_create_ssa_value(struct vtn_builder *b, const struct glsl_type *type)
+{
+   struct vtn_ssa_value *val = rzalloc(b, struct vtn_ssa_value);
+   val->type = type;
+
+   if (!glsl_type_is_vector_or_scalar(type)) {
+      unsigned elems = glsl_get_length(type);
+      val->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+      for (unsigned i = 0; i < elems; i++) {
+         const struct glsl_type *child_type;
+
+         switch (glsl_get_base_type(type)) {
+         case GLSL_TYPE_INT:
+         case GLSL_TYPE_UINT:
+         case GLSL_TYPE_BOOL:
+         case GLSL_TYPE_FLOAT:
+         case GLSL_TYPE_DOUBLE:
+            /* Non-vector/scalar with a numeric base type must be a
+             * matrix; each child is one column.
+             */
+            child_type = glsl_get_column_type(type);
+            break;
+         case GLSL_TYPE_ARRAY:
+            child_type = glsl_get_array_element(type);
+            break;
+         case GLSL_TYPE_STRUCT:
+            child_type = glsl_get_struct_field(type, i);
+            break;
+         default:
+            unreachable("unknown base type");
+         }
+
+         val->elems[i] = vtn_create_ssa_value(b, child_type);
+      }
+   }
+
+   return val;
+}
+
+/* Builds a nir_tex_src of the given type from the SSA value with the
+ * given SPIR-V result ID.
+ */
+static nir_tex_src
+vtn_tex_src(struct vtn_builder *b, unsigned index, nir_tex_src_type type)
+{
+   nir_tex_src result;
+   result.src_type = type;
+   result.src = nir_src_for_ssa(vtn_ssa_value(b, index)->def);
+   return result;
+}
+
+/* Handles the texture/sampling opcodes (and OpSampledImage) by building
+ * a nir_tex_instr: gathers the coordinate, comparator, and any optional
+ * image-operand sources, picks the NIR texop, and fills in the sampler
+ * dimension/array/shadow information from the image type.
+ */
+static void
+vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
+                   const uint32_t *w, unsigned count)
+{
+   if (opcode == SpvOpSampledImage) {
+      /* Just pair up the image and sampler derefs; no instruction. */
+      struct vtn_value *val =
+         vtn_push_value(b, w[2], vtn_value_type_sampled_image);
+      val->sampled_image = ralloc(b, struct vtn_sampled_image);
+      val->sampled_image->image =
+         vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      val->sampled_image->sampler =
+         vtn_value(b, w[4], vtn_value_type_deref)->deref;
+      return;
+   }
+
+   struct vtn_type *ret_type = vtn_value(b, w[1], vtn_value_type_type)->type;
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+
+   /* w[3] is either a sampled image or a bare sampler deref. */
+   struct vtn_sampled_image sampled;
+   struct vtn_value *sampled_val = vtn_untyped_value(b, w[3]);
+   if (sampled_val->value_type == vtn_value_type_sampled_image) {
+      sampled = *sampled_val->sampled_image;
+   } else {
+      assert(sampled_val->value_type == vtn_value_type_deref);
+      sampled.image = NULL;
+      sampled.sampler = sampled_val->deref;
+   }
+
+   nir_tex_src srcs[8]; /* 8 should be enough */
+   nir_tex_src *p = srcs;
+
+   unsigned idx = 4;
+
+   bool has_coord = false;
+   switch (opcode) {
+   case SpvOpImageSampleImplicitLod:
+   case SpvOpImageSampleExplicitLod:
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjImplicitLod:
+   case SpvOpImageSampleProjExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+   case SpvOpImageFetch:
+   case SpvOpImageGather:
+   case SpvOpImageDrefGather:
+   case SpvOpImageQueryLod: {
+      /* All these types have the coordinate as their first real argument */
+      struct vtn_ssa_value *coord = vtn_ssa_value(b, w[idx++]);
+      has_coord = true;
+      p->src = nir_src_for_ssa(coord->def);
+      p->src_type = nir_tex_src_coord;
+      p++;
+      break;
+   }
+
+   default:
+      break;
+   }
+
+   /* These all have an explicit depth value as their next source */
+   switch (opcode) {
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+      (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_comparitor);
+      break;
+   default:
+      break;
+   }
+
+   /* Figure out the base texture operation */
+   nir_texop texop;
+   switch (opcode) {
+   case SpvOpImageSampleImplicitLod:
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleProjImplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+      texop = nir_texop_tex;
+      break;
+
+   case SpvOpImageSampleExplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjExplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+      texop = nir_texop_txl;
+      break;
+
+   case SpvOpImageFetch:
+      texop = nir_texop_txf;
+      break;
+
+   case SpvOpImageGather:
+   case SpvOpImageDrefGather:
+      texop = nir_texop_tg4;
+      break;
+
+   case SpvOpImageQuerySizeLod:
+   case SpvOpImageQuerySize:
+      texop = nir_texop_txs;
+      break;
+
+   case SpvOpImageQueryLod:
+      texop = nir_texop_lod;
+      break;
+
+   case SpvOpImageQueryLevels:
+      texop = nir_texop_query_levels;
+      break;
+
+   case SpvOpImageQuerySamples:
+   default:
+      unreachable("Unhandled opcode");
+   }
+
+   /* Now we need to handle some number of optional arguments */
+   if (idx < count) {
+      uint32_t operands = w[idx++];
+
+      if (operands & SpvImageOperandsBiasMask) {
+         /* A bias operand turns a plain sample into a biased one. */
+         assert(texop == nir_texop_tex);
+         texop = nir_texop_txb;
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_bias);
+      }
+
+      if (operands & SpvImageOperandsLodMask) {
+         assert(texop == nir_texop_txl || texop == nir_texop_txf ||
+                texop == nir_texop_txs);
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod);
+      }
+
+      if (operands & SpvImageOperandsGradMask) {
+         /* Explicit derivatives turn a plain sample into txd. */
+         assert(texop == nir_texop_tex);
+         texop = nir_texop_txd;
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddx);
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddy);
+      }
+
+      if (operands & SpvImageOperandsOffsetMask ||
+          operands & SpvImageOperandsConstOffsetMask)
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_offset);
+
+      if (operands & SpvImageOperandsConstOffsetsMask)
+         assert(!"Constant offsets to texture gather not yet implemented");
+
+      if (operands & SpvImageOperandsSampleMask) {
+         assert(texop == nir_texop_txf);
+         texop = nir_texop_txf_ms;
+         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ms_index);
+      }
+   }
+   /* We should have now consumed exactly all of the arguments */
+   assert(idx == count);
+
+   nir_tex_instr *instr = nir_tex_instr_create(b->shader, p - srcs);
+   instr->op = texop;
+
+   memcpy(instr->src, srcs, instr->num_srcs * sizeof(*instr->src));
+
+   /* Derive sampler info from the image type (or the sampler's type if
+    * no separate image deref was given).
+    */
+   const struct glsl_type *image_type;
+   if (sampled.image) {
+      image_type = nir_deref_tail(&sampled.image->deref)->type;
+   } else {
+      image_type = nir_deref_tail(&sampled.sampler->deref)->type;
+   }
+
+   instr->sampler_dim = glsl_get_sampler_dim(image_type);
+   instr->is_array = glsl_sampler_type_is_array(image_type);
+   instr->is_shadow = glsl_sampler_type_is_shadow(image_type);
+   instr->is_new_style_shadow = instr->is_shadow;
+
+   if (has_coord) {
+      switch (instr->sampler_dim) {
+      case GLSL_SAMPLER_DIM_1D:
+      case GLSL_SAMPLER_DIM_BUF:
+         instr->coord_components = 1;
+         break;
+      case GLSL_SAMPLER_DIM_2D:
+      case GLSL_SAMPLER_DIM_RECT:
+         instr->coord_components = 2;
+         break;
+      case GLSL_SAMPLER_DIM_3D:
+      case GLSL_SAMPLER_DIM_CUBE:
+      case GLSL_SAMPLER_DIM_MS:
+         instr->coord_components = 3;
+         break;
+      default:
+         /* Was assert("..."), which is a non-null string literal and
+          * therefore always true — it could never fire.
+          */
+         unreachable("Invalid sampler type");
+      }
+
+      if (instr->is_array)
+         instr->coord_components++;
+   } else {
+      instr->coord_components = 0;
+   }
+
+   switch (glsl_get_sampler_result_type(image_type)) {
+   case GLSL_TYPE_FLOAT:   instr->dest_type = nir_type_float;     break;
+   case GLSL_TYPE_INT:     instr->dest_type = nir_type_int;       break;
+   case GLSL_TYPE_UINT:    instr->dest_type = nir_type_uint;      break;
+   case GLSL_TYPE_BOOL:    instr->dest_type = nir_type_bool;      break;
+   default:
+      unreachable("Invalid base type for sampler result");
+   }
+
+   instr->sampler =
+      nir_deref_as_var(nir_copy_deref(instr, &sampled.sampler->deref));
+   if (sampled.image) {
+      instr->texture =
+         nir_deref_as_var(nir_copy_deref(instr, &sampled.image->deref));
+   } else {
+      instr->texture = NULL;
+   }
+
+   nir_ssa_dest_init(&instr->instr, &instr->dest,
+                     nir_tex_instr_dest_size(instr), NULL);
+
+   assert(glsl_get_vector_elements(ret_type->type) ==
+          nir_tex_instr_dest_size(instr));
+
+   val->ssa = vtn_create_ssa_value(b, ret_type->type);
+   val->ssa->def = &instr->dest.ssa;
+
+   nir_builder_instr_insert(&b->nb, &instr->instr);
+}
+
+/* Fetches an image coordinate and widens it to the 4 components that
+ * the image_load_store intrinsics expect, repeating the last valid
+ * component in the unused slots.
+ */
+static nir_ssa_def *
+get_image_coord(struct vtn_builder *b, uint32_t value)
+{
+   struct vtn_ssa_value *coord = vtn_ssa_value(b, value);
+
+   /* The image_load_store intrinsics assume a 4-dim coordinate */
+   unsigned dim = glsl_get_vector_elements(coord->type);
+   unsigned swizzle[4];
+   for (unsigned c = 0; c < 4; c++)
+      swizzle[c] = (c < dim) ? c : dim - 1;
+
+   return nir_swizzle(&b->nb, coord->def, swizzle, 4, false);
+}
+
+/* Handles the image load/store/atomic/query opcodes (and
+ * OpImageTexelPointer) by emitting the matching nir_intrinsic_image_*
+ * intrinsic.
+ */
+static void
+vtn_handle_image(struct vtn_builder *b, SpvOp opcode,
+                 const uint32_t *w, unsigned count)
+{
+   /* Just get this one out of the way */
+   if (opcode == SpvOpImageTexelPointer) {
+      struct vtn_value *val =
+         vtn_push_value(b, w[2], vtn_value_type_image_pointer);
+      val->image = ralloc(b, struct vtn_image_pointer);
+
+      val->image->deref = vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      val->image->coord = get_image_coord(b, w[4]);
+      val->image->sample = vtn_ssa_value(b, w[5])->def;
+      return;
+   }
+
+   /* Gather the image deref, coordinate, and sample index.  Where the
+    * operands live depends on the opcode (OpImageWrite has no result ID,
+    * so its operands start at w[1]).
+    */
+   struct vtn_image_pointer image;
+
+   switch (opcode) {
+   case SpvOpAtomicExchange:
+   case SpvOpAtomicCompareExchange:
+   case SpvOpAtomicCompareExchangeWeak:
+   case SpvOpAtomicIIncrement:
+   case SpvOpAtomicIDecrement:
+   case SpvOpAtomicIAdd:
+   case SpvOpAtomicISub:
+   case SpvOpAtomicSMin:
+   case SpvOpAtomicUMin:
+   case SpvOpAtomicSMax:
+   case SpvOpAtomicUMax:
+   case SpvOpAtomicAnd:
+   case SpvOpAtomicOr:
+   case SpvOpAtomicXor:
+      image = *vtn_value(b, w[3], vtn_value_type_image_pointer)->image;
+      break;
+
+   case SpvOpImageQuerySize:
+      image.deref = vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      image.coord = NULL;
+      image.sample = NULL;
+      break;
+
+   case SpvOpImageRead:
+      image.deref = vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      image.coord = get_image_coord(b, w[4]);
+
+      if (count > 5 && (w[5] & SpvImageOperandsSampleMask)) {
+         assert(w[5] == SpvImageOperandsSampleMask);
+         image.sample = vtn_ssa_value(b, w[6])->def;
+      } else {
+         image.sample = nir_ssa_undef(&b->nb, 1);
+      }
+      break;
+
+   case SpvOpImageWrite:
+      image.deref = vtn_value(b, w[1], vtn_value_type_deref)->deref;
+      image.coord = get_image_coord(b, w[2]);
+
+      /* texel = w[3] */
+
+      if (count > 4 && (w[4] & SpvImageOperandsSampleMask)) {
+         assert(w[4] == SpvImageOperandsSampleMask);
+         image.sample = vtn_ssa_value(b, w[5])->def;
+      } else {
+         image.sample = nir_ssa_undef(&b->nb, 1);
+      }
+      break;
+
+   default:
+      unreachable("Invalid image opcode");
+   }
+
+   nir_intrinsic_op op;
+   switch (opcode) {
+#define OP(S, N) case SpvOp##S: op = nir_intrinsic_image_##N; break;
+   OP(ImageQuerySize,         size)
+   OP(ImageRead,              load)
+   OP(ImageWrite,             store)
+   OP(AtomicExchange,         atomic_exchange)
+   OP(AtomicCompareExchange,  atomic_comp_swap)
+   OP(AtomicIIncrement,       atomic_add)
+   OP(AtomicIDecrement,       atomic_add)
+   OP(AtomicIAdd,             atomic_add)
+   OP(AtomicISub,             atomic_add)
+   OP(AtomicSMin,             atomic_min)
+   OP(AtomicUMin,             atomic_min)
+   OP(AtomicSMax,             atomic_max)
+   OP(AtomicUMax,             atomic_max)
+   OP(AtomicAnd,              atomic_and)
+   OP(AtomicOr,               atomic_or)
+   OP(AtomicXor,              atomic_xor)
+#undef OP
+   default:
+      unreachable("Invalid image opcode");
+   }
+
+   nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
+   intrin->variables[0] =
+      nir_deref_as_var(nir_copy_deref(&intrin->instr, &image.deref->deref));
+
+   /* ImageQuerySize doesn't take any extra parameters */
+   if (opcode != SpvOpImageQuerySize) {
+      /* The image coordinate is always 4 components but we may not have that
+       * many.  Swizzle to compensate.
+       */
+      unsigned swiz[4];
+      for (unsigned i = 0; i < 4; i++)
+         swiz[i] = i < image.coord->num_components ? i : 0;
+      intrin->src[0] = nir_src_for_ssa(nir_swizzle(&b->nb, image.coord,
+                                                   swiz, 4, false));
+      intrin->src[1] = nir_src_for_ssa(image.sample);
+   }
+
+   /* Fill in the data source (src[2], plus src[3] for compare-swap). */
+   switch (opcode) {
+   case SpvOpImageQuerySize:
+   case SpvOpImageRead:
+      break;
+   case SpvOpImageWrite:
+      intrin->src[2] = nir_src_for_ssa(vtn_ssa_value(b, w[3])->def);
+      break;
+   case SpvOpAtomicIIncrement:
+      intrin->src[2] = nir_src_for_ssa(nir_imm_int(&b->nb, 1));
+      break;
+   case SpvOpAtomicIDecrement:
+      intrin->src[2] = nir_src_for_ssa(nir_imm_int(&b->nb, -1));
+      break;
+
+   case SpvOpAtomicExchange:
+   case SpvOpAtomicIAdd:
+   case SpvOpAtomicSMin:
+   case SpvOpAtomicUMin:
+   case SpvOpAtomicSMax:
+   case SpvOpAtomicUMax:
+   case SpvOpAtomicAnd:
+   case SpvOpAtomicOr:
+   case SpvOpAtomicXor:
+      intrin->src[2] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
+      break;
+
+   case SpvOpAtomicCompareExchange:
+      intrin->src[2] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
+      intrin->src[3] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
+      break;
+
+   case SpvOpAtomicISub:
+      /* ISub is lowered to atomic_add of the negated operand. */
+      intrin->src[2] = nir_src_for_ssa(nir_ineg(&b->nb, vtn_ssa_value(b, w[6])->def));
+      break;
+
+   default:
+      unreachable("Invalid image opcode");
+   }
+
+   if (opcode != SpvOpImageWrite) {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL);
+
+      nir_builder_instr_insert(&b->nb, &intrin->instr);
+
+      /* The image intrinsics always return 4 channels but we may not want
+       * that many.  Emit a mov to trim it down.
+       */
+      unsigned swiz[4] = {0, 1, 2, 3};
+      val->ssa = vtn_create_ssa_value(b, type->type);
+      val->ssa->def = nir_swizzle(&b->nb, &intrin->dest.ssa, swiz,
+                                  glsl_get_vector_elements(type->type), false);
+   } else {
+      nir_builder_instr_insert(&b->nb, &intrin->instr);
+   }
+}
+
+/* Translate a SPIR-V atomic opcode into the matching SSBO atomic intrinsic.
+ *
+ * IIncrement, IDecrement, and ISub all map onto atomic_add; the caller
+ * (fill_common_atomic_sources) supplies 1, -1, or a negated operand
+ * respectively as the data source to make the semantics come out right.
+ */
+static nir_intrinsic_op
+get_ssbo_nir_atomic_op(SpvOp opcode)
+{
+   switch (opcode) {
+#define OP(S, N) case SpvOp##S: return nir_intrinsic_ssbo_##N;
+   OP(AtomicExchange,         atomic_exchange)
+   OP(AtomicCompareExchange,  atomic_comp_swap)
+   OP(AtomicIIncrement,       atomic_add)
+   OP(AtomicIDecrement,       atomic_add)
+   OP(AtomicIAdd,             atomic_add)
+   OP(AtomicISub,             atomic_add)
+   OP(AtomicSMin,             atomic_imin)
+   OP(AtomicUMin,             atomic_umin)
+   OP(AtomicSMax,             atomic_imax)
+   OP(AtomicUMax,             atomic_umax)
+   OP(AtomicAnd,              atomic_and)
+   OP(AtomicOr,               atomic_or)
+   OP(AtomicXor,              atomic_xor)
+#undef OP
+   default:
+      unreachable("Invalid SSBO atomic");
+   }
+}
+
+/* Translate a SPIR-V atomic opcode into the matching variable-based (var_)
+ * atomic intrinsic, used for shared (workgroup) memory.
+ *
+ * Same mapping convention as get_ssbo_nir_atomic_op: increment, decrement,
+ * and subtract become atomic_add and rely on the caller to adjust the data
+ * source accordingly.
+ */
+static nir_intrinsic_op
+get_shared_nir_atomic_op(SpvOp opcode)
+{
+   switch (opcode) {
+#define OP(S, N) case SpvOp##S: return nir_intrinsic_var_##N;
+   OP(AtomicExchange,         atomic_exchange)
+   OP(AtomicCompareExchange,  atomic_comp_swap)
+   OP(AtomicIIncrement,       atomic_add)
+   OP(AtomicIDecrement,       atomic_add)
+   OP(AtomicIAdd,             atomic_add)
+   OP(AtomicISub,             atomic_add)
+   OP(AtomicSMin,             atomic_imin)
+   OP(AtomicUMin,             atomic_umin)
+   OP(AtomicSMax,             atomic_imax)
+   OP(AtomicUMax,             atomic_umax)
+   OP(AtomicAnd,              atomic_and)
+   OP(AtomicOr,               atomic_or)
+   OP(AtomicXor,              atomic_xor)
+#undef OP
+   default:
+      unreachable("Invalid shared atomic");
+   }
+}
+
+/* Compute the byte offset of an SSBO atomic target by walking the deref
+ * chain below the resource-level deref.
+ *
+ * `type` must be the vtn_type corresponding to `deref` on entry; array
+ * strides and struct member offsets are read from the (decorated) vtn_type
+ * as the walk descends.  Returns a dynamically-computed i32 offset.
+ */
+static nir_ssa_def *
+get_ssbo_atomic_offset(struct vtn_builder *b, nir_deref *deref, struct vtn_type *type)
+{
+   nir_ssa_def *offset = nir_imm_int(&b->nb, 0);
+
+   while (deref->child) {
+      deref = deref->child;
+      switch (deref->deref_type) {
+      case nir_deref_type_array:
+         /* offset += index * stride */
+         offset = nir_iadd(&b->nb, offset,
+                           nir_imul(&b->nb, deref_array_offset(b, deref),
+                                    nir_imm_int(&b->nb, type->stride)));
+         type = type->array_element;
+         continue;
+
+      case nir_deref_type_struct: {
+         /* offset += byte offset of the selected member */
+         unsigned member = nir_deref_as_struct(deref)->index;
+         offset = nir_iadd(&b->nb, offset,
+                           nir_imm_int(&b->nb, type->offsets[member]));
+         type = type->members[member];
+         continue;
+      }
+
+      default:
+         unreachable("Invalid deref type");
+      }
+   }
+
+   return offset;
+}
+
+/* Fill in the data source(s) of an atomic intrinsic from the SPIR-V operand
+ * words.  For plain atomics the value operand is w[6]; compare-exchange
+ * additionally carries a comparator in w[8] (operand layout per the SPIR-V
+ * spec: result type, result, pointer, scope, semantics[, semantics2],
+ * value[, comparator]).
+ *
+ * Increment/decrement/subtract are lowered to atomic_add with an immediate
+ * 1, -1, or a negated value, matching the *_nir_atomic_op tables above.
+ */
+static void
+fill_common_atomic_sources(struct vtn_builder *b, SpvOp opcode,
+                           const uint32_t *w, nir_src *src)
+{
+   switch (opcode) {
+   case SpvOpAtomicIIncrement:
+      src[0] = nir_src_for_ssa(nir_imm_int(&b->nb, 1));
+      break;
+
+   case SpvOpAtomicIDecrement:
+      src[0] = nir_src_for_ssa(nir_imm_int(&b->nb, -1));
+      break;
+
+   case SpvOpAtomicISub:
+      src[0] =
+         nir_src_for_ssa(nir_ineg(&b->nb, vtn_ssa_value(b, w[6])->def));
+      break;
+
+   case SpvOpAtomicCompareExchange:
+      src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
+      src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[8])->def);
+      break;
+
+   case SpvOpAtomicExchange:
+   case SpvOpAtomicIAdd:
+   case SpvOpAtomicSMin:
+   case SpvOpAtomicUMin:
+   case SpvOpAtomicSMax:
+   case SpvOpAtomicUMax:
+   case SpvOpAtomicAnd:
+   case SpvOpAtomicOr:
+   case SpvOpAtomicXor:
+      src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
+      break;
+
+   default:
+      unreachable("Invalid SPIR-V atomic");
+   }
+}
+
+/* Handle a SPIR-V atomic whose pointer is an SSBO or shared-memory deref.
+ *
+ * Shared (workgroup) variables use the variable-based var_atomic_*
+ * intrinsics with the deref attached; SSBO pointers are lowered to an
+ * (index, offset) pair feeding the ssbo_atomic_* intrinsics.  The result
+ * is pushed as a single-component SSA value for w[2].
+ */
+static void
+vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
+                                 const uint32_t *w, unsigned count)
+{
+   struct vtn_value *pointer = vtn_value(b, w[3], vtn_value_type_deref);
+   struct vtn_type *type = pointer->deref_type;
+   nir_deref *deref = &pointer->deref->deref;
+   nir_intrinsic_instr *atomic;
+
+   /* Memory scope and semantics are currently ignored:
+   SpvScope scope = w[4];
+   SpvMemorySemanticsMask semantics = w[5];
+   */
+
+   if (pointer->deref->var->data.mode == nir_var_shared) {
+      nir_intrinsic_op op = get_shared_nir_atomic_op(opcode);
+      atomic = nir_intrinsic_instr_create(b->nb.shader, op);
+      atomic->variables[0] =
+         nir_deref_as_var(nir_copy_deref(atomic, &pointer->deref->deref));
+      fill_common_atomic_sources(b, opcode, w, &atomic->src[0]);
+   } else {
+      /* SSBO path: src[0] = block index, src[1] = byte offset, data after */
+      nir_ssa_def *index = get_vulkan_resource_index(b, &deref, &type);
+      nir_ssa_def *offset = get_ssbo_atomic_offset(b, deref, type);
+      nir_intrinsic_op op = get_ssbo_nir_atomic_op(opcode);
+
+      atomic = nir_intrinsic_instr_create(b->nb.shader, op);
+      atomic->src[0] = nir_src_for_ssa(index);
+      atomic->src[1] = nir_src_for_ssa(offset);
+      fill_common_atomic_sources(b, opcode, w, &atomic->src[2]);
+   }
+
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL);
+
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+   val->ssa = rzalloc(b, struct vtn_ssa_value);
+   val->ssa->def = &atomic->dest.ssa;
+   val->ssa->type = type->type;
+
+   nir_builder_instr_insert(&b->nb, &atomic->instr);
+}
+
+/* Create (but do not insert) an ALU instruction that assembles a vector of
+ * the given width.  A one-component "vector" is just a float move.  The
+ * destination SSA value and write mask are initialized; sources are left
+ * for the caller to fill in.
+ */
+static nir_alu_instr *
+create_vec(nir_shader *shader, unsigned num_components)
+{
+   nir_op op;
+   if (num_components == 1)
+      op = nir_op_fmov;
+   else if (num_components == 2)
+      op = nir_op_vec2;
+   else if (num_components == 3)
+      op = nir_op_vec3;
+   else if (num_components == 4)
+      op = nir_op_vec4;
+   else
+      unreachable("bad vector size");
+
+   nir_alu_instr *vec = nir_alu_instr_create(shader, op);
+   nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL);
+   vec->dest.write_mask = (1 << num_components) - 1;
+
+   return vec;
+}
+
+/* Return the transpose of `src`, building it column-by-column out of vec
+ * instructions.  Results are memoized: if src was itself produced by this
+ * function, its cached `transposed` pointer is returned directly.  Note
+ * that the new value's `transposed` field is set to `src` (not to itself),
+ * so transposing the result again yields the original value for free.
+ */
+struct vtn_ssa_value *
+vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src)
+{
+   if (src->transposed)
+      return src->transposed;
+
+   struct vtn_ssa_value *dest =
+      vtn_create_ssa_value(b, glsl_transposed_type(src->type));
+
+   for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) {
+      nir_alu_instr *vec = create_vec(b->shader,
+                                      glsl_get_matrix_columns(src->type));
+      if (glsl_type_is_vector_or_scalar(src->type)) {
+          /* src is a column vector; row i of the transpose is component i */
+          vec->src[0].src = nir_src_for_ssa(src->def);
+          vec->src[0].swizzle[0] = i;
+      } else {
+         /* gather component i of every column of src */
+         for (unsigned j = 0; j < glsl_get_matrix_columns(src->type); j++) {
+            vec->src[j].src = nir_src_for_ssa(src->elems[j]->def);
+            vec->src[j].swizzle[0] = i;
+         }
+      }
+      nir_builder_instr_insert(&b->nb, &vec->instr);
+      dest->elems[i]->def = &vec->dest.dest.ssa;
+   }
+
+   dest->transposed = src;
+
+   return dest;
+}
+
+/* Extract component `index` of `src` as a scalar SSA value. */
+static nir_ssa_def *
+vtn_vector_extract(struct vtn_builder *b, nir_ssa_def *src, unsigned index)
+{
+   unsigned comp[4] = { index, 0, 0, 0 };
+   return nir_swizzle(&b->nb, src, comp, 1, true);
+}
+
+
+/* Build a copy of `src` with component `index` replaced by the scalar
+ * `insert`, by re-assembling the vector one component at a time.
+ */
+static nir_ssa_def *
+vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert,
+                  unsigned index)
+{
+   nir_alu_instr *vec = create_vec(b->shader, src->num_components);
+
+   for (unsigned c = 0; c < src->num_components; c++) {
+      if (c != index) {
+         /* pass this component of src through unchanged */
+         vec->src[c].src = nir_src_for_ssa(src);
+         vec->src[c].swizzle[0] = c;
+      } else {
+         vec->src[c].src = nir_src_for_ssa(insert);
+      }
+   }
+
+   nir_builder_instr_insert(&b->nb, &vec->instr);
+
+   return &vec->dest.dest.ssa;
+}
+
+/* Extract a component selected by a run-time index: start with component 0
+ * and chain bcsels that pick component i whenever `index` equals i.
+ */
+static nir_ssa_def *
+vtn_vector_extract_dynamic(struct vtn_builder *b, nir_ssa_def *src,
+                           nir_ssa_def *index)
+{
+   nir_ssa_def *result = vtn_vector_extract(b, src, 0);
+   for (unsigned c = 1; c < src->num_components; c++) {
+      nir_ssa_def *is_c = nir_ieq(&b->nb, index, nir_imm_int(&b->nb, c));
+      result = nir_bcsel(&b->nb, is_c,
+                         vtn_vector_extract(b, src, c), result);
+   }
+
+   return result;
+}
+
+/* Insert `insert` at a run-time index: chain bcsels over the vectors
+ * produced by inserting at every possible static position.
+ */
+static nir_ssa_def *
+vtn_vector_insert_dynamic(struct vtn_builder *b, nir_ssa_def *src,
+                          nir_ssa_def *insert, nir_ssa_def *index)
+{
+   nir_ssa_def *result = vtn_vector_insert(b, src, insert, 0);
+   for (unsigned c = 1; c < src->num_components; c++) {
+      nir_ssa_def *is_c = nir_ieq(&b->nb, index, nir_imm_int(&b->nb, c));
+      result = nir_bcsel(&b->nb, is_c,
+                         vtn_vector_insert(b, src, insert, c), result);
+   }
+
+   return result;
+}
+
+/* Implement OpVectorShuffle: build a vector whose component i is taken
+ * from src0 (indices below src0->num_components) or src1 (the rest).
+ * The SPIR-V sentinel index 0xffffffff selects an undefined value.
+ */
+static nir_ssa_def *
+vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components,
+                   nir_ssa_def *src0, nir_ssa_def *src1,
+                   const uint32_t *indices)
+{
+   nir_alu_instr *vec = create_vec(b->shader, num_components);
+
+   /* Single undef shared by every 0xffffffff component */
+   nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1);
+   nir_builder_instr_insert(&b->nb, &undef->instr);
+
+   for (unsigned i = 0; i < num_components; i++) {
+      uint32_t index = indices[i];
+      if (index == 0xffffffff) {
+         vec->src[i].src = nir_src_for_ssa(&undef->def);
+      } else if (index < src0->num_components) {
+         vec->src[i].src = nir_src_for_ssa(src0);
+         vec->src[i].swizzle[0] = index;
+      } else {
+         vec->src[i].src = nir_src_for_ssa(src1);
+         vec->src[i].swizzle[0] = index - src0->num_components;
+      }
+   }
+
+   nir_builder_instr_insert(&b->nb, &vec->instr);
+
+   return &vec->dest.dest.ssa;
+}
+
+/*
+ * Concatenates a number of vectors/scalars together to produce a vector.
+ * Assumes the components of the sources sum to exactly num_components
+ * (at most 4) — TODO confirm callers guarantee this.
+ */
+static nir_ssa_def *
+vtn_vector_construct(struct vtn_builder *b, unsigned num_components,
+                     unsigned num_srcs, nir_ssa_def **srcs)
+{
+   nir_alu_instr *vec = create_vec(b->shader, num_components);
+
+   unsigned dest_idx = 0;
+   for (unsigned i = 0; i < num_srcs; i++) {
+      nir_ssa_def *src = srcs[i];
+      /* copy every component of this source into the next slots */
+      for (unsigned j = 0; j < src->num_components; j++) {
+         vec->src[dest_idx].src = nir_src_for_ssa(src);
+         vec->src[dest_idx].swizzle[0] = j;
+         dest_idx++;
+      }
+   }
+
+   nir_builder_instr_insert(&b->nb, &vec->instr);
+
+   return &vec->dest.dest.ssa;
+}
+
+/* Deep-copy a vtn_ssa_value tree.  Leaves (vectors/scalars) share the
+ * underlying SSA def; composite nodes get freshly allocated element arrays
+ * so the copy can be mutated (e.g. by OpCompositeInsert) independently.
+ */
+static struct vtn_ssa_value *
+vtn_composite_copy(void *mem_ctx, struct vtn_ssa_value *src)
+{
+   struct vtn_ssa_value *copy = rzalloc(mem_ctx, struct vtn_ssa_value);
+   copy->type = src->type;
+
+   if (glsl_type_is_vector_or_scalar(src->type)) {
+      copy->def = src->def;
+      return copy;
+   }
+
+   unsigned count = glsl_get_length(src->type);
+   copy->elems = ralloc_array(mem_ctx, struct vtn_ssa_value *, count);
+   for (unsigned i = 0; i < count; i++)
+      copy->elems[i] = vtn_composite_copy(mem_ctx, src->elems[i]);
+
+   return copy;
+}
+
+/* Implement OpCompositeInsert: return a copy of `src` with the element at
+ * the given index chain replaced by `insert`.  The copy is shallow at the
+ * leaves (shared SSA defs), so only the touched path is rebuilt.
+ */
+static struct vtn_ssa_value *
+vtn_composite_insert(struct vtn_builder *b, struct vtn_ssa_value *src,
+                     struct vtn_ssa_value *insert, const uint32_t *indices,
+                     unsigned num_indices)
+{
+   struct vtn_ssa_value *dest = vtn_composite_copy(b, src);
+
+   /* walk to the parent of the insertion point */
+   struct vtn_ssa_value *cur = dest;
+   unsigned i;
+   for (i = 0; i < num_indices - 1; i++) {
+      cur = cur->elems[indices[i]];
+   }
+
+   if (glsl_type_is_vector_or_scalar(cur->type)) {
+      /* According to the SPIR-V spec, OpCompositeInsert may work down to
+       * the component granularity. In that case, the last index will be
+       * the index to insert the scalar into the vector.
+       */
+
+      cur->def = vtn_vector_insert(b, cur->def, insert->def, indices[i]);
+   } else {
+      cur->elems[indices[i]] = insert;
+   }
+
+   return dest;
+}
+
+/* Implement OpCompositeExtract: walk the index chain into `src` and return
+ * the selected element.  No copy is made unless the final index reaches
+ * into a vector, in which case a fresh scalar value is created.
+ */
+static struct vtn_ssa_value *
+vtn_composite_extract(struct vtn_builder *b, struct vtn_ssa_value *src,
+                      const uint32_t *indices, unsigned num_indices)
+{
+   struct vtn_ssa_value *cur = src;
+   for (unsigned i = 0; i < num_indices; i++) {
+      if (glsl_type_is_vector_or_scalar(cur->type)) {
+         assert(i == num_indices - 1);
+         /* According to the SPIR-V spec, OpCompositeExtract may work down to
+          * the component granularity. The last index will be the index of the
+          * vector to extract.
+          */
+
+         struct vtn_ssa_value *ret = rzalloc(b, struct vtn_ssa_value);
+         ret->type = glsl_scalar_type(glsl_get_base_type(cur->type));
+         ret->def = vtn_vector_extract(b, cur->def, indices[i]);
+         return ret;
+      } else {
+         cur = cur->elems[indices[i]];
+      }
+   }
+
+   return cur;
+}
+
+/* Dispatch the SPIR-V composite instructions (vector extract/insert,
+ * shuffle, construct, composite extract/insert, object copy) and push the
+ * result as the SSA value for w[2].
+ */
+static void
+vtn_handle_composite(struct vtn_builder *b, SpvOp opcode,
+                     const uint32_t *w, unsigned count)
+{
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+   const struct glsl_type *type =
+      vtn_value(b, w[1], vtn_value_type_type)->type->type;
+   val->ssa = vtn_create_ssa_value(b, type);
+
+   switch (opcode) {
+   case SpvOpVectorExtractDynamic:
+      val->ssa->def = vtn_vector_extract_dynamic(b, vtn_ssa_value(b, w[3])->def,
+                                                 vtn_ssa_value(b, w[4])->def);
+      break;
+
+   case SpvOpVectorInsertDynamic:
+      val->ssa->def = vtn_vector_insert_dynamic(b, vtn_ssa_value(b, w[3])->def,
+                                                vtn_ssa_value(b, w[4])->def,
+                                                vtn_ssa_value(b, w[5])->def);
+      break;
+
+   case SpvOpVectorShuffle:
+      val->ssa->def = vtn_vector_shuffle(b, glsl_get_vector_elements(type),
+                                         vtn_ssa_value(b, w[3])->def,
+                                         vtn_ssa_value(b, w[4])->def,
+                                         w + 5);
+      break;
+
+   case SpvOpCompositeConstruct: {
+      unsigned elems = count - 3;
+      if (glsl_type_is_vector_or_scalar(type)) {
+         /* vectors have at most 4 constituent values */
+         nir_ssa_def *srcs[4];
+         for (unsigned i = 0; i < elems; i++)
+            srcs[i] = vtn_ssa_value(b, w[3 + i])->def;
+         val->ssa->def =
+            vtn_vector_construct(b, glsl_get_vector_elements(type),
+                                 elems, srcs);
+      } else {
+         /* matrices/arrays/structs: just collect the element values */
+         val->ssa->elems = ralloc_array(b, struct vtn_ssa_value *, elems);
+         for (unsigned i = 0; i < elems; i++)
+            val->ssa->elems[i] = vtn_ssa_value(b, w[3 + i]);
+      }
+      break;
+   }
+   case SpvOpCompositeExtract:
+      val->ssa = vtn_composite_extract(b, vtn_ssa_value(b, w[3]),
+                                       w + 4, count - 4);
+      break;
+
+   case SpvOpCompositeInsert:
+      val->ssa = vtn_composite_insert(b, vtn_ssa_value(b, w[4]),
+                                      vtn_ssa_value(b, w[3]),
+                                      w + 5, count - 5);
+      break;
+
+   case SpvOpCopyObject:
+      val->ssa = vtn_composite_copy(b, vtn_ssa_value(b, w[3]));
+      break;
+
+   default:
+      unreachable("unknown composite operation");
+   }
+}
+
+/* Lower the SPIR-V barrier and geometry-stream instructions to the
+ * corresponding NIR intrinsics.  For the stream variants, w[1] is the
+ * stream number and is recorded in const_index[0].
+ */
+static void
+vtn_handle_barrier(struct vtn_builder *b, SpvOp opcode,
+                   const uint32_t *w, unsigned count)
+{
+   nir_intrinsic_op intrinsic_op;
+   switch (opcode) {
+   case SpvOpEmitVertex:
+   case SpvOpEmitStreamVertex:
+      intrinsic_op = nir_intrinsic_emit_vertex;
+      break;
+   case SpvOpEndPrimitive:
+   case SpvOpEndStreamPrimitive:
+      intrinsic_op = nir_intrinsic_end_primitive;
+      break;
+   case SpvOpMemoryBarrier:
+      intrinsic_op = nir_intrinsic_memory_barrier;
+      break;
+   case SpvOpControlBarrier:
+      intrinsic_op = nir_intrinsic_barrier;
+      break;
+   default:
+      unreachable("unknown barrier instruction");
+   }
+
+   nir_intrinsic_instr *intrin =
+      nir_intrinsic_instr_create(b->shader, intrinsic_op);
+
+   if (opcode == SpvOpEmitStreamVertex || opcode == SpvOpEndStreamPrimitive)
+      intrin->const_index[0] = w[1];
+
+   nir_builder_instr_insert(&b->nb, &intrin->instr);
+}
+
+/* Map a SPIR-V primitive-topology execution mode onto the GL primitive
+ * enum value that NIR's geometry-shader info expects.  The raw GL numbers
+ * are used so this file does not need to include a GL header.
+ */
+static unsigned
+gl_primitive_from_spv_execution_mode(SpvExecutionMode mode)
+{
+   switch (mode) {
+   case SpvExecutionModeInputPoints:
+   case SpvExecutionModeOutputPoints:
+      return 0; /* GL_POINTS */
+   case SpvExecutionModeInputLines:
+      return 1; /* GL_LINES */
+   case SpvExecutionModeInputLinesAdjacency:
+      return 0x000A; /* GL_LINE_STRIP_ADJACENCY_ARB */
+   case SpvExecutionModeTriangles:
+      return 4; /* GL_TRIANGLES */
+   case SpvExecutionModeInputTrianglesAdjacency:
+      return 0x000C; /* GL_TRIANGLES_ADJACENCY_ARB */
+   case SpvExecutionModeQuads:
+      return 7; /* GL_QUADS */
+   case SpvExecutionModeIsolines:
+      return 0x8E7A; /* GL_ISOLINES */
+   case SpvExecutionModeOutputLineStrip:
+      return 3; /* GL_LINE_STRIP */
+   case SpvExecutionModeOutputTriangleStrip:
+      return 5; /* GL_TRIANGLE_STRIP */
+   default:
+      assert(!"Invalid primitive type");
+      return 4;
+   }
+}
+
+/* Return the number of input vertices per primitive for a geometry-shader
+ * input-topology execution mode (0 on invalid input, after asserting).
+ */
+static unsigned
+vertices_in_from_spv_execution_mode(SpvExecutionMode mode)
+{
+   switch (mode) {
+   case SpvExecutionModeInputPoints:
+      return 1;
+   case SpvExecutionModeInputLines:
+      return 2;
+   case SpvExecutionModeInputLinesAdjacency:
+      return 4;
+   case SpvExecutionModeTriangles:
+      return 3;
+   case SpvExecutionModeInputTrianglesAdjacency:
+      return 6;
+   default:
+      assert(!"Invalid GS input mode");
+      return 0;
+   }
+}
+
+/* Map a SPIR-V execution model onto the corresponding Mesa shader stage.
+ * Kernel and other non-graphics models are unsupported and unreachable.
+ */
+static gl_shader_stage
+stage_for_execution_model(SpvExecutionModel model)
+{
+   switch (model) {
+   case SpvExecutionModelVertex:
+      return MESA_SHADER_VERTEX;
+   case SpvExecutionModelTessellationControl:
+      return MESA_SHADER_TESS_CTRL;
+   case SpvExecutionModelTessellationEvaluation:
+      return MESA_SHADER_TESS_EVAL;
+   case SpvExecutionModelGeometry:
+      return MESA_SHADER_GEOMETRY;
+   case SpvExecutionModelFragment:
+      return MESA_SHADER_FRAGMENT;
+   case SpvExecutionModelGLCompute:
+      return MESA_SHADER_COMPUTE;
+   default:
+      unreachable("Unsupported execution model");
+   }
+}
+
+/* Process one instruction from the SPIR-V preamble (debug info,
+ * capabilities, extensions, memory model, entry points, names, and
+ * decorations).  Returns false on the first instruction that is not part
+ * of the preamble, which tells vtn_foreach_instruction to stop.
+ */
+static bool
+vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
+                                const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpSource:
+   case SpvOpSourceExtension:
+   case SpvOpSourceContinued:
+   case SpvOpExtension:
+      /* Unhandled, but these are for debug so that's ok. */
+      break;
+
+   case SpvOpCapability:
+      /* Only the core graphics capabilities are accepted so far. */
+      switch ((SpvCapability)w[1]) {
+      case SpvCapabilityMatrix:
+      case SpvCapabilityShader:
+      case SpvCapabilityGeometry:
+         break;
+      default:
+         assert(!"Unsupported capability");
+      }
+      break;
+
+   case SpvOpExtInstImport:
+      vtn_handle_extension(b, opcode, w, count);
+      break;
+
+   case SpvOpMemoryModel:
+      assert(w[1] == SpvAddressingModelLogical);
+      assert(w[2] == SpvMemoryModelGLSL450);
+      break;
+
+   case SpvOpEntryPoint: {
+      struct vtn_value *entry_point = &b->values[w[2]];
+      /* Let this be a name label regardless */
+      unsigned name_words;
+      entry_point->name = vtn_string_literal(b, &w[3], count - 3, &name_words);
+
+      /* Only latch the entry point whose name and stage match what the
+       * caller asked spirv_to_nir for.
+       */
+      if (strcmp(entry_point->name, b->entry_point_name) != 0 ||
+          stage_for_execution_model(w[1]) != b->entry_point_stage)
+         break;
+
+      assert(b->entry_point == NULL);
+      b->entry_point = entry_point;
+      break;
+   }
+
+   case SpvOpString:
+      vtn_push_value(b, w[1], vtn_value_type_string)->str =
+         vtn_string_literal(b, &w[2], count - 2, NULL);
+      break;
+
+   case SpvOpName:
+      b->values[w[1]].name = vtn_string_literal(b, &w[2], count - 2, NULL);
+      break;
+
+   case SpvOpMemberName:
+      /* TODO */
+      break;
+
+   case SpvOpExecutionMode:
+   case SpvOpDecorationGroup:
+   case SpvOpDecorate:
+   case SpvOpMemberDecorate:
+   case SpvOpGroupDecorate:
+   case SpvOpGroupMemberDecorate:
+      vtn_handle_decoration(b, opcode, w, count);
+      break;
+
+   default:
+      return false; /* End of preamble */
+   }
+
+   return true;
+}
+
+/* Decoration callback: apply one OpExecutionMode of the chosen entry point
+ * to the nir_shader's stage-specific info.  Unsupported/tessellation modes
+ * assert; OpenCL-only hints are ignored.
+ */
+static void
+vtn_handle_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point,
+                          const struct vtn_decoration *mode, void *data)
+{
+   assert(b->entry_point == entry_point);
+
+   switch(mode->exec_mode) {
+   case SpvExecutionModeOriginUpperLeft:
+   case SpvExecutionModeOriginLowerLeft:
+      b->origin_upper_left =
+         (mode->exec_mode == SpvExecutionModeOriginUpperLeft);
+      break;
+
+   case SpvExecutionModeEarlyFragmentTests:
+      assert(b->shader->stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.early_fragment_tests = true;
+      break;
+
+   case SpvExecutionModeInvocations:
+      assert(b->shader->stage == MESA_SHADER_GEOMETRY);
+      b->shader->info.gs.invocations = MAX2(1, mode->literals[0]);
+      break;
+
+   case SpvExecutionModeDepthReplacing:
+      assert(b->shader->stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_ANY;
+      break;
+   case SpvExecutionModeDepthGreater:
+      assert(b->shader->stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_GREATER;
+      break;
+   case SpvExecutionModeDepthLess:
+      assert(b->shader->stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_LESS;
+      break;
+   case SpvExecutionModeDepthUnchanged:
+      assert(b->shader->stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
+      break;
+
+   case SpvExecutionModeLocalSize:
+      assert(b->shader->stage == MESA_SHADER_COMPUTE);
+      b->shader->info.cs.local_size[0] = mode->literals[0];
+      b->shader->info.cs.local_size[1] = mode->literals[1];
+      b->shader->info.cs.local_size[2] = mode->literals[2];
+      break;
+   case SpvExecutionModeLocalSizeHint:
+      break; /* Nothing to do with this */
+
+   case SpvExecutionModeOutputVertices:
+      assert(b->shader->stage == MESA_SHADER_GEOMETRY);
+      b->shader->info.gs.vertices_out = mode->literals[0];
+      break;
+
+   case SpvExecutionModeInputPoints:
+   case SpvExecutionModeInputLines:
+   case SpvExecutionModeInputLinesAdjacency:
+   case SpvExecutionModeTriangles:
+   case SpvExecutionModeInputTrianglesAdjacency:
+   case SpvExecutionModeQuads:
+   case SpvExecutionModeIsolines:
+      /* Input topology: meaningful for GS (and eventually tessellation). */
+      if (b->shader->stage == MESA_SHADER_GEOMETRY) {
+         b->shader->info.gs.vertices_in =
+            vertices_in_from_spv_execution_mode(mode->exec_mode);
+      } else {
+         assert(!"Tesselation shaders not yet supported");
+      }
+      break;
+
+   case SpvExecutionModeOutputPoints:
+   case SpvExecutionModeOutputLineStrip:
+   case SpvExecutionModeOutputTriangleStrip:
+      assert(b->shader->stage == MESA_SHADER_GEOMETRY);
+      b->shader->info.gs.output_primitive =
+         gl_primitive_from_spv_execution_mode(mode->exec_mode);
+      break;
+
+   case SpvExecutionModeSpacingEqual:
+   case SpvExecutionModeSpacingFractionalEven:
+   case SpvExecutionModeSpacingFractionalOdd:
+   case SpvExecutionModeVertexOrderCw:
+   case SpvExecutionModeVertexOrderCcw:
+   case SpvExecutionModePointMode:
+      assert(!"TODO: Add tessellation metadata");
+      break;
+
+   case SpvExecutionModePixelCenterInteger:
+   case SpvExecutionModeXfb:
+      assert(!"Unhandled execution mode");
+      break;
+
+   case SpvExecutionModeVecTypeHint:
+   case SpvExecutionModeContractionOff:
+      break; /* OpenCL */
+   }
+}
+
+/* Process one instruction from the types/variables/constants section of
+ * the module.  Preamble opcodes reappearing here are invalid; returns
+ * false on the first opcode belonging to a function body, which stops
+ * the enclosing vtn_foreach_instruction walk.
+ */
+static bool
+vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode,
+                                        const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpSource:
+   case SpvOpSourceContinued:
+   case SpvOpSourceExtension:
+   case SpvOpExtension:
+   case SpvOpCapability:
+   case SpvOpExtInstImport:
+   case SpvOpMemoryModel:
+   case SpvOpEntryPoint:
+   case SpvOpExecutionMode:
+   case SpvOpString:
+   case SpvOpName:
+   case SpvOpMemberName:
+   case SpvOpDecorationGroup:
+   case SpvOpDecorate:
+   case SpvOpMemberDecorate:
+   case SpvOpGroupDecorate:
+   case SpvOpGroupMemberDecorate:
+      assert(!"Invalid opcode types and variables section");
+      break;
+
+   case SpvOpTypeVoid:
+   case SpvOpTypeBool:
+   case SpvOpTypeInt:
+   case SpvOpTypeFloat:
+   case SpvOpTypeVector:
+   case SpvOpTypeMatrix:
+   case SpvOpTypeImage:
+   case SpvOpTypeSampler:
+   case SpvOpTypeSampledImage:
+   case SpvOpTypeArray:
+   case SpvOpTypeRuntimeArray:
+   case SpvOpTypeStruct:
+   case SpvOpTypeOpaque:
+   case SpvOpTypePointer:
+   case SpvOpTypeFunction:
+   case SpvOpTypeEvent:
+   case SpvOpTypeDeviceEvent:
+   case SpvOpTypeReserveId:
+   case SpvOpTypeQueue:
+   case SpvOpTypePipe:
+      vtn_handle_type(b, opcode, w, count);
+      break;
+
+   case SpvOpConstantTrue:
+   case SpvOpConstantFalse:
+   case SpvOpConstant:
+   case SpvOpConstantComposite:
+   case SpvOpConstantSampler:
+   case SpvOpConstantNull:
+   case SpvOpSpecConstantTrue:
+   case SpvOpSpecConstantFalse:
+   case SpvOpSpecConstant:
+   case SpvOpSpecConstantComposite:
+   case SpvOpSpecConstantOp:
+      vtn_handle_constant(b, opcode, w, count);
+      break;
+
+   case SpvOpVariable:
+      vtn_handle_variables(b, opcode, w, count);
+      break;
+
+   default:
+      return false; /* End of the types/variables section */
+   }
+
+   return true;
+}
+
+/* Process one instruction inside a function body, dispatching to the
+ * per-category handlers (variables, texturing, images, atomics, ALU,
+ * composites, barriers).  Control-flow opcodes are handled by the CFG
+ * pre-pass and are no-ops here.  Always returns true; an unhandled
+ * opcode is unreachable.
+ */
+static bool
+vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
+                            const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpLabel:
+      break;
+
+   case SpvOpLoopMerge:
+   case SpvOpSelectionMerge:
+      /* This is handled by cfg pre-pass and walk_blocks */
+      break;
+
+   case SpvOpUndef: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_undef);
+      val->type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      break;
+   }
+
+   case SpvOpExtInst:
+      vtn_handle_extension(b, opcode, w, count);
+      break;
+
+   case SpvOpVariable:
+   case SpvOpLoad:
+   case SpvOpStore:
+   case SpvOpCopyMemory:
+   case SpvOpCopyMemorySized:
+   case SpvOpAccessChain:
+   case SpvOpInBoundsAccessChain:
+   case SpvOpArrayLength:
+      vtn_handle_variables(b, opcode, w, count);
+      break;
+
+   case SpvOpFunctionCall:
+      vtn_handle_function_call(b, opcode, w, count);
+      break;
+
+   case SpvOpSampledImage:
+   case SpvOpImageSampleImplicitLod:
+   case SpvOpImageSampleExplicitLod:
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjImplicitLod:
+   case SpvOpImageSampleProjExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+   case SpvOpImageFetch:
+   case SpvOpImageGather:
+   case SpvOpImageDrefGather:
+   case SpvOpImageQuerySizeLod:
+   case SpvOpImageQueryLod:
+   case SpvOpImageQueryLevels:
+   case SpvOpImageQuerySamples:
+      vtn_handle_texture(b, opcode, w, count);
+      break;
+
+   case SpvOpImageRead:
+   case SpvOpImageWrite:
+   case SpvOpImageTexelPointer:
+      vtn_handle_image(b, opcode, w, count);
+      break;
+
+   case SpvOpImageQuerySize: {
+      /* Storage images go through the image path; sampled images through
+       * the texture path.  Distinguish by the deref's GLSL type.
+       */
+      nir_deref_var *image = vtn_value(b, w[3], vtn_value_type_deref)->deref;
+      const struct glsl_type *image_type = nir_deref_tail(&image->deref)->type;
+      if (glsl_type_is_image(image_type)) {
+         vtn_handle_image(b, opcode, w, count);
+      } else {
+         vtn_handle_texture(b, opcode, w, count);
+      }
+      break;
+   }
+
+   case SpvOpAtomicExchange:
+   case SpvOpAtomicCompareExchange:
+   case SpvOpAtomicCompareExchangeWeak:
+   case SpvOpAtomicIIncrement:
+   case SpvOpAtomicIDecrement:
+   case SpvOpAtomicIAdd:
+   case SpvOpAtomicISub:
+   case SpvOpAtomicSMin:
+   case SpvOpAtomicUMin:
+   case SpvOpAtomicSMax:
+   case SpvOpAtomicUMax:
+   case SpvOpAtomicAnd:
+   case SpvOpAtomicOr:
+   case SpvOpAtomicXor: {
+      /* Route by pointer kind: image texel pointers vs. buffer/shared derefs */
+      struct vtn_value *pointer = vtn_untyped_value(b, w[3]);
+      if (pointer->value_type == vtn_value_type_image_pointer) {
+         vtn_handle_image(b, opcode, w, count);
+      } else {
+         assert(pointer->value_type == vtn_value_type_deref);
+         vtn_handle_ssbo_or_shared_atomic(b, opcode, w, count);
+      }
+      break;
+   }
+
+   case SpvOpSNegate:
+   case SpvOpFNegate:
+   case SpvOpNot:
+   case SpvOpAny:
+   case SpvOpAll:
+   case SpvOpConvertFToU:
+   case SpvOpConvertFToS:
+   case SpvOpConvertSToF:
+   case SpvOpConvertUToF:
+   case SpvOpUConvert:
+   case SpvOpSConvert:
+   case SpvOpFConvert:
+   case SpvOpQuantizeToF16:
+   case SpvOpConvertPtrToU:
+   case SpvOpConvertUToPtr:
+   case SpvOpPtrCastToGeneric:
+   case SpvOpGenericCastToPtr:
+   case SpvOpBitcast:
+   case SpvOpIsNan:
+   case SpvOpIsInf:
+   case SpvOpIsFinite:
+   case SpvOpIsNormal:
+   case SpvOpSignBitSet:
+   case SpvOpLessOrGreater:
+   case SpvOpOrdered:
+   case SpvOpUnordered:
+   case SpvOpIAdd:
+   case SpvOpFAdd:
+   case SpvOpISub:
+   case SpvOpFSub:
+   case SpvOpIMul:
+   case SpvOpFMul:
+   case SpvOpUDiv:
+   case SpvOpSDiv:
+   case SpvOpFDiv:
+   case SpvOpUMod:
+   case SpvOpSRem:
+   case SpvOpSMod:
+   case SpvOpFRem:
+   case SpvOpFMod:
+   case SpvOpVectorTimesScalar:
+   case SpvOpDot:
+   case SpvOpIAddCarry:
+   case SpvOpISubBorrow:
+   case SpvOpUMulExtended:
+   case SpvOpSMulExtended:
+   case SpvOpShiftRightLogical:
+   case SpvOpShiftRightArithmetic:
+   case SpvOpShiftLeftLogical:
+   case SpvOpLogicalEqual:
+   case SpvOpLogicalNotEqual:
+   case SpvOpLogicalOr:
+   case SpvOpLogicalAnd:
+   case SpvOpLogicalNot:
+   case SpvOpBitwiseOr:
+   case SpvOpBitwiseXor:
+   case SpvOpBitwiseAnd:
+   case SpvOpSelect:
+   case SpvOpIEqual:
+   case SpvOpFOrdEqual:
+   case SpvOpFUnordEqual:
+   case SpvOpINotEqual:
+   case SpvOpFOrdNotEqual:
+   case SpvOpFUnordNotEqual:
+   case SpvOpULessThan:
+   case SpvOpSLessThan:
+   case SpvOpFOrdLessThan:
+   case SpvOpFUnordLessThan:
+   case SpvOpUGreaterThan:
+   case SpvOpSGreaterThan:
+   case SpvOpFOrdGreaterThan:
+   case SpvOpFUnordGreaterThan:
+   case SpvOpULessThanEqual:
+   case SpvOpSLessThanEqual:
+   case SpvOpFOrdLessThanEqual:
+   case SpvOpFUnordLessThanEqual:
+   case SpvOpUGreaterThanEqual:
+   case SpvOpSGreaterThanEqual:
+   case SpvOpFOrdGreaterThanEqual:
+   case SpvOpFUnordGreaterThanEqual:
+   case SpvOpDPdx:
+   case SpvOpDPdy:
+   case SpvOpFwidth:
+   case SpvOpDPdxFine:
+   case SpvOpDPdyFine:
+   case SpvOpFwidthFine:
+   case SpvOpDPdxCoarse:
+   case SpvOpDPdyCoarse:
+   case SpvOpFwidthCoarse:
+   case SpvOpBitFieldInsert:
+   case SpvOpBitFieldSExtract:
+   case SpvOpBitFieldUExtract:
+   case SpvOpBitReverse:
+   case SpvOpBitCount:
+   case SpvOpTranspose:
+   case SpvOpOuterProduct:
+   case SpvOpMatrixTimesScalar:
+   case SpvOpVectorTimesMatrix:
+   case SpvOpMatrixTimesVector:
+   case SpvOpMatrixTimesMatrix:
+      vtn_handle_alu(b, opcode, w, count);
+      break;
+
+   case SpvOpVectorExtractDynamic:
+   case SpvOpVectorInsertDynamic:
+   case SpvOpVectorShuffle:
+   case SpvOpCompositeConstruct:
+   case SpvOpCompositeExtract:
+   case SpvOpCompositeInsert:
+   case SpvOpCopyObject:
+      vtn_handle_composite(b, opcode, w, count);
+      break;
+
+   case SpvOpEmitVertex:
+   case SpvOpEndPrimitive:
+   case SpvOpEmitStreamVertex:
+   case SpvOpEndStreamPrimitive:
+   case SpvOpControlBarrier:
+   case SpvOpMemoryBarrier:
+      vtn_handle_barrier(b, opcode, w, count);
+      break;
+
+   default:
+      unreachable("Unhandled opcode");
+   }
+
+   return true;
+}
+
+/* Convert a SPIR-V module to NIR and return the nir_function for the
+ * requested entry point (by name and stage), or NULL if the entry point
+ * is not found.  `spec`/`num_spec` supply specialization-constant
+ * overrides applied while parsing constants.
+ */
+nir_function *
+spirv_to_nir(const uint32_t *words, size_t word_count,
+             struct nir_spirv_specialization *spec, unsigned num_spec,
+             gl_shader_stage stage, const char *entry_point_name,
+             const nir_shader_compiler_options *options)
+{
+   const uint32_t *word_end = words + word_count;
+
+   /* Handle the SPIR-V header (first 5 words) */
+   assert(word_count > 5);
+
+   assert(words[0] == SpvMagicNumber);
+   assert(words[1] >= 0x10000);
+   /* words[2] == generator magic */
+   unsigned value_id_bound = words[3];
+   assert(words[4] == 0);
+
+   words+= 5;
+
+   /* Initialize the vtn_builder object */
+   struct vtn_builder *b = rzalloc(NULL, struct vtn_builder);
+   b->value_id_bound = value_id_bound;
+   b->values = rzalloc_array(b, struct vtn_value, value_id_bound);
+   exec_list_make_empty(&b->functions);
+   b->entry_point_stage = stage;
+   b->entry_point_name = entry_point_name;
+
+   /* Handle all the preamble instructions */
+   words = vtn_foreach_instruction(b, words, word_end,
+                                   vtn_handle_preamble_instruction);
+
+   if (b->entry_point == NULL) {
+      assert(!"Entry point not found");
+      ralloc_free(b);
+      return NULL;
+   }
+
+   b->shader = nir_shader_create(NULL, stage, options);
+
+   /* Parse execution modes */
+   vtn_foreach_execution_mode(b, b->entry_point,
+                              vtn_handle_execution_mode, NULL);
+
+   b->specializations = spec;
+   b->num_specializations = num_spec;
+
+   /* Handle all variable, type, and constant instructions */
+   words = vtn_foreach_instruction(b, words, word_end,
+                                   vtn_handle_variable_or_type_instruction);
+
+   vtn_build_cfg(b, words, word_end);
+
+   /* Emit NIR for every function body found by the CFG pass. */
+   foreach_list_typed(struct vtn_function, func, node, &b->functions) {
+      b->impl = func->impl;
+      b->const_table = _mesa_hash_table_create(b, _mesa_hash_pointer,
+                                               _mesa_key_pointer_equal);
+
+      vtn_function_emit(b, func, vtn_handle_body_instruction);
+   }
+
+   assert(b->entry_point->value_type == vtn_value_type_function);
+   nir_function *entry_point = b->entry_point->func->impl->function;
+   assert(entry_point);
+
+   ralloc_free(b);
+
+   return entry_point;
+}
diff --git a/src/glsl/nir/spirv/vtn_alu.c b/src/glsl/nir/spirv/vtn_alu.c
new file mode 100644 (file)
index 0000000..d866da7
--- /dev/null
@@ -0,0 +1,448 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vtn_private.h"
+
+/*
+ * Normally, column vectors in SPIR-V correspond to a single NIR SSA
+ * definition. But for matrix multiplies, we want to do one routine for
+ * multiplying a matrix by a matrix and then pretend that vectors are matrices
+ * with one column. So we "wrap" these things, and unwrap the result before we
+ * send it off.
+ */
+
+/* Wrap a value so it can be handled uniformly as a matrix: a non-matrix
+ * value is repackaged as a one-column "matrix" whose only column is the
+ * value itself.  NULL passes through unchanged and a real matrix is
+ * returned as-is.  See the file comment above for the motivation.
+ */
+static struct vtn_ssa_value *
+wrap_matrix(struct vtn_builder *b, struct vtn_ssa_value *val)
+{
+   if (val == NULL)
+      return NULL;
+
+   if (glsl_type_is_matrix(val->type))
+      return val;
+
+   /* NOTE(review): the wrapper deliberately keeps the wrapped value's
+    * (vector) type; callers only index elems[] and rely on
+    * glsl_get_matrix_columns() returning 1 for vector types.
+    */
+   struct vtn_ssa_value *dest = rzalloc(b, struct vtn_ssa_value);
+   dest->type = val->type;
+   dest->elems = ralloc_array(b, struct vtn_ssa_value *, 1);
+   dest->elems[0] = val;
+
+   return dest;
+}
+
+/* Inverse of wrap_matrix(): a real matrix is returned unchanged, a
+ * wrapped vector is unwrapped back to its single column.
+ */
+static struct vtn_ssa_value *
+unwrap_matrix(struct vtn_ssa_value *val)
+{
+   if (glsl_type_is_matrix(val->type))
+         return val;
+
+   return val->elems[0];
+}
+
+/* Emit NIR for a SPIR-V matrix (or matrix/vector) multiply.
+ *
+ * Both operands are wrapped so vectors look like one-column matrices
+ * (see wrap_matrix()); the result is unwrapped before returning.  If
+ * either source carries a lazily-recorded transpose, we exploit the
+ * identity transpose(A) * transpose(B) == transpose(B * A) to avoid
+ * materializing the transposes.
+ */
+static struct vtn_ssa_value *
+matrix_multiply(struct vtn_builder *b,
+                struct vtn_ssa_value *_src0, struct vtn_ssa_value *_src1)
+{
+
+   struct vtn_ssa_value *src0 = wrap_matrix(b, _src0);
+   struct vtn_ssa_value *src1 = wrap_matrix(b, _src1);
+   struct vtn_ssa_value *src0_transpose = wrap_matrix(b, _src0->transposed);
+   struct vtn_ssa_value *src1_transpose = wrap_matrix(b, _src1->transposed);
+
+   unsigned src0_rows = glsl_get_vector_elements(src0->type);
+   unsigned src0_columns = glsl_get_matrix_columns(src0->type);
+   unsigned src1_columns = glsl_get_matrix_columns(src1->type);
+
+   /* (src0_rows x src0_columns) * (src0_columns x src1_columns) yields a
+    * src0_rows x src1_columns result; a single result column is a vector.
+    */
+   const struct glsl_type *dest_type;
+   if (src1_columns > 1) {
+      dest_type = glsl_matrix_type(glsl_get_base_type(src0->type),
+                                   src0_rows, src1_columns);
+   } else {
+      dest_type = glsl_vector_type(glsl_get_base_type(src0->type), src0_rows);
+   }
+   struct vtn_ssa_value *dest = vtn_create_ssa_value(b, dest_type);
+
+   dest = wrap_matrix(b, dest);
+
+   bool transpose_result = false;
+   if (src0_transpose && src1_transpose) {
+      /* transpose(A) * transpose(B) = transpose(B * A) */
+      src1 = src0_transpose;
+      src0 = src1_transpose;
+      src0_transpose = NULL;
+      src1_transpose = NULL;
+      transpose_result = true;
+   }
+
+   if (src0_transpose && !src1_transpose &&
+       glsl_get_base_type(src0->type) == GLSL_TYPE_FLOAT) {
+      /* We already have the rows of src0 and the columns of src1 available,
+       * so we can just take the dot product of each row with each column to
+       * get the result.
+       */
+
+      for (unsigned i = 0; i < src1_columns; i++) {
+         nir_ssa_def *vec_src[4];
+         for (unsigned j = 0; j < src0_rows; j++) {
+            vec_src[j] = nir_fdot(&b->nb, src0_transpose->elems[j]->def,
+                                          src1->elems[i]->def);
+         }
+         dest->elems[i]->def = nir_vec(&b->nb, vec_src, src0_rows);
+      }
+   } else {
+      /* We don't handle the case where src1 is transposed but not src0, since
+       * the general case only uses individual components of src1 so the
+       * optimizer should chew through the transpose we emitted for src1.
+       */
+
+      for (unsigned i = 0; i < src1_columns; i++) {
+         /* dest[i] = sum(src0[j] * src1[i][j] for all j) */
+         dest->elems[i]->def =
+            nir_fmul(&b->nb, src0->elems[0]->def,
+                     nir_channel(&b->nb, src1->elems[i]->def, 0));
+         for (unsigned j = 1; j < src0_columns; j++) {
+            dest->elems[i]->def =
+               nir_fadd(&b->nb, dest->elems[i]->def,
+                        nir_fmul(&b->nb, src0->elems[j]->def,
+                                 nir_channel(&b->nb, src1->elems[i]->def, j)));
+         }
+      }
+   }
+
+   dest = unwrap_matrix(dest);
+
+   if (transpose_result)
+      dest = vtn_ssa_transpose(b, dest);
+
+   return dest;
+}
+
+/* Multiply every column of |mat| by |scalar|, picking a float or integer
+ * multiply based on the matrix base type.  The builder splats the scalar
+ * across the column vector.
+ */
+static struct vtn_ssa_value *
+mat_times_scalar(struct vtn_builder *b,
+                 struct vtn_ssa_value *mat,
+                 nir_ssa_def *scalar)
+{
+   struct vtn_ssa_value *dest = vtn_create_ssa_value(b, mat->type);
+   for (unsigned i = 0; i < glsl_get_matrix_columns(mat->type); i++) {
+      if (glsl_get_base_type(mat->type) == GLSL_TYPE_FLOAT)
+         dest->elems[i]->def = nir_fmul(&b->nb, mat->elems[i]->def, scalar);
+      else
+         dest->elems[i]->def = nir_imul(&b->nb, mat->elems[i]->def, scalar);
+   }
+
+   return dest;
+}
+
+/* Handle the subset of ALU opcodes where at least one operand is a
+ * matrix.  Column-wise ops (negate/add/sub) iterate over columns; the
+ * multiply variants are normalized to matrix * matrix form and handed
+ * to matrix_multiply().  src1 may be NULL for unary opcodes.
+ */
+static void
+vtn_handle_matrix_alu(struct vtn_builder *b, SpvOp opcode,
+                      struct vtn_value *dest,
+                      struct vtn_ssa_value *src0, struct vtn_ssa_value *src1)
+{
+   switch (opcode) {
+   case SpvOpFNegate: {
+      dest->ssa = vtn_create_ssa_value(b, src0->type);
+      unsigned cols = glsl_get_matrix_columns(src0->type);
+      for (unsigned i = 0; i < cols; i++)
+         dest->ssa->elems[i]->def = nir_fneg(&b->nb, src0->elems[i]->def);
+      break;
+   }
+
+   case SpvOpFAdd: {
+      dest->ssa = vtn_create_ssa_value(b, src0->type);
+      unsigned cols = glsl_get_matrix_columns(src0->type);
+      for (unsigned i = 0; i < cols; i++)
+         dest->ssa->elems[i]->def =
+            nir_fadd(&b->nb, src0->elems[i]->def, src1->elems[i]->def);
+      break;
+   }
+
+   case SpvOpFSub: {
+      dest->ssa = vtn_create_ssa_value(b, src0->type);
+      unsigned cols = glsl_get_matrix_columns(src0->type);
+      for (unsigned i = 0; i < cols; i++)
+         dest->ssa->elems[i]->def =
+            nir_fsub(&b->nb, src0->elems[i]->def, src1->elems[i]->def);
+      break;
+   }
+
+   case SpvOpTranspose:
+      dest->ssa = vtn_ssa_transpose(b, src0);
+      break;
+
+   case SpvOpMatrixTimesScalar:
+      /* If src0 is itself a pending transpose, scale the untransposed
+       * matrix and re-mark the result as transposed instead of
+       * materializing the transpose first.
+       */
+      if (src0->transposed) {
+         dest->ssa = vtn_ssa_transpose(b, mat_times_scalar(b, src0->transposed,
+                                                           src1->def));
+      } else {
+         dest->ssa = mat_times_scalar(b, src0, src1->def);
+      }
+      break;
+
+   case SpvOpVectorTimesMatrix:
+   case SpvOpMatrixTimesVector:
+   case SpvOpMatrixTimesMatrix:
+      /* vector * matrix == transpose(matrix) * vector, which lets all
+       * three multiply forms share matrix_multiply().
+       */
+      if (opcode == SpvOpVectorTimesMatrix) {
+         dest->ssa = matrix_multiply(b, vtn_ssa_transpose(b, src1), src0);
+      } else {
+         dest->ssa = matrix_multiply(b, src0, src1);
+      }
+      break;
+
+   default: unreachable("unknown matrix opcode");
+   }
+}
+
+/* Map a SPIR-V ALU opcode to the corresponding NIR opcode.
+ *
+ * On return, *swap indicates that the first two arguments should be
+ * swapped; this is used for implementing greater-than and
+ * less-than-or-equal with NIR's less-than/greater-equal opcodes.
+ * Unmappable opcodes hit unreachable().
+ */
+nir_op
+vtn_nir_alu_op_for_spirv_opcode(SpvOp opcode, bool *swap)
+{
+   *swap = false;
+
+   switch (opcode) {
+   case SpvOpSNegate:            return nir_op_ineg;
+   case SpvOpFNegate:            return nir_op_fneg;
+   case SpvOpNot:                return nir_op_inot;
+   case SpvOpIAdd:               return nir_op_iadd;
+   case SpvOpFAdd:               return nir_op_fadd;
+   case SpvOpISub:               return nir_op_isub;
+   case SpvOpFSub:               return nir_op_fsub;
+   case SpvOpIMul:               return nir_op_imul;
+   case SpvOpFMul:               return nir_op_fmul;
+   case SpvOpUDiv:               return nir_op_udiv;
+   case SpvOpSDiv:               return nir_op_idiv;
+   case SpvOpFDiv:               return nir_op_fdiv;
+   case SpvOpUMod:               return nir_op_umod;
+   case SpvOpSMod:               return nir_op_imod;
+   case SpvOpFMod:               return nir_op_fmod;
+   case SpvOpSRem:               return nir_op_irem;
+   case SpvOpFRem:               return nir_op_frem;
+
+   case SpvOpShiftRightLogical:     return nir_op_ushr;
+   case SpvOpShiftRightArithmetic:  return nir_op_ishr;
+   case SpvOpShiftLeftLogical:      return nir_op_ishl;
+   case SpvOpLogicalOr:             return nir_op_ior;
+   case SpvOpLogicalEqual:          return nir_op_ieq;
+   case SpvOpLogicalNotEqual:       return nir_op_ine;
+   case SpvOpLogicalAnd:            return nir_op_iand;
+   case SpvOpLogicalNot:            return nir_op_inot;
+   case SpvOpBitwiseOr:             return nir_op_ior;
+   case SpvOpBitwiseXor:            return nir_op_ixor;
+   case SpvOpBitwiseAnd:            return nir_op_iand;
+   case SpvOpSelect:                return nir_op_bcsel;
+   case SpvOpIEqual:                return nir_op_ieq;
+
+   case SpvOpBitFieldInsert:        return nir_op_bitfield_insert;
+   case SpvOpBitFieldSExtract:      return nir_op_ibitfield_extract;
+   case SpvOpBitFieldUExtract:      return nir_op_ubitfield_extract;
+   case SpvOpBitReverse:            return nir_op_bitfield_reverse;
+   case SpvOpBitCount:              return nir_op_bit_count;
+
+   /* Comparisons: (TODO: How do we want to handled ordered/unordered?) */
+   case SpvOpFOrdEqual:                            return nir_op_feq;
+   case SpvOpFUnordEqual:                          return nir_op_feq;
+   case SpvOpINotEqual:                            return nir_op_ine;
+   case SpvOpFOrdNotEqual:                         return nir_op_fne;
+   case SpvOpFUnordNotEqual:                       return nir_op_fne;
+   case SpvOpULessThan:                            return nir_op_ult;
+   case SpvOpSLessThan:                            return nir_op_ilt;
+   case SpvOpFOrdLessThan:                         return nir_op_flt;
+   case SpvOpFUnordLessThan:                       return nir_op_flt;
+   case SpvOpUGreaterThan:          *swap = true;  return nir_op_ult;
+   case SpvOpSGreaterThan:          *swap = true;  return nir_op_ilt;
+   case SpvOpFOrdGreaterThan:       *swap = true;  return nir_op_flt;
+   case SpvOpFUnordGreaterThan:     *swap = true;  return nir_op_flt;
+   case SpvOpULessThanEqual:        *swap = true;  return nir_op_uge;
+   case SpvOpSLessThanEqual:        *swap = true;  return nir_op_ige;
+   case SpvOpFOrdLessThanEqual:     *swap = true;  return nir_op_fge;
+   case SpvOpFUnordLessThanEqual:   *swap = true;  return nir_op_fge;
+   case SpvOpUGreaterThanEqual:                    return nir_op_uge;
+   case SpvOpSGreaterThanEqual:                    return nir_op_ige;
+   case SpvOpFOrdGreaterThanEqual:                 return nir_op_fge;
+   case SpvOpFUnordGreaterThanEqual:               return nir_op_fge;
+
+   /* Conversions: */
+   case SpvOpConvertFToU:           return nir_op_f2u;
+   case SpvOpConvertFToS:           return nir_op_f2i;
+   case SpvOpConvertSToF:           return nir_op_i2f;
+   case SpvOpConvertUToF:           return nir_op_u2f;
+   case SpvOpBitcast:               return nir_op_imov;
+   case SpvOpQuantizeToF16:         return nir_op_fquantize2f16;
+   /* TODO: NIR is 32-bit only; integer width conversions are no-ops.
+    * Fix: UConvert previously fell through to fquantize2f16, which is a
+    * float quantization op; an unsigned width conversion must be a move,
+    * exactly like SConvert.
+    */
+   case SpvOpUConvert:              return nir_op_imov;
+   case SpvOpSConvert:              return nir_op_imov;
+   case SpvOpFConvert:              return nir_op_fmov;
+
+   /* Derivatives: */
+   case SpvOpDPdx:         return nir_op_fddx;
+   case SpvOpDPdy:         return nir_op_fddy;
+   case SpvOpDPdxFine:     return nir_op_fddx_fine;
+   case SpvOpDPdyFine:     return nir_op_fddy_fine;
+   case SpvOpDPdxCoarse:   return nir_op_fddx_coarse;
+   case SpvOpDPdyCoarse:   return nir_op_fddy_coarse;
+
+   default:
+      unreachable("No NIR equivalent");
+   }
+}
+
+/* Translate one SPIR-V ALU instruction into NIR.
+ *
+ * w[1] is the result type, w[2] the result id, and w[3..] the operand
+ * ids (so num_inputs = count - 3).  Matrix operands are routed to
+ * vtn_handle_matrix_alu(); a handful of opcodes with no one-to-one NIR
+ * equivalent are expanded inline; everything else goes through
+ * vtn_nir_alu_op_for_spirv_opcode().
+ */
+void
+vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
+               const uint32_t *w, unsigned count)
+{
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+   const struct glsl_type *type =
+      vtn_value(b, w[1], vtn_value_type_type)->type->type;
+
+   /* Collect the various SSA sources */
+   const unsigned num_inputs = count - 3;
+   struct vtn_ssa_value *vtn_src[4] = { NULL, };
+   for (unsigned i = 0; i < num_inputs; i++)
+      vtn_src[i] = vtn_ssa_value(b, w[i + 3]);
+
+   if (glsl_type_is_matrix(vtn_src[0]->type) ||
+       (num_inputs >= 2 && glsl_type_is_matrix(vtn_src[1]->type))) {
+      vtn_handle_matrix_alu(b, opcode, val, vtn_src[0], vtn_src[1]);
+      return;
+   }
+
+   val->ssa = vtn_create_ssa_value(b, type);
+   nir_ssa_def *src[4] = { NULL, };
+   for (unsigned i = 0; i < num_inputs; i++) {
+      assert(glsl_type_is_vector_or_scalar(vtn_src[i]->type));
+      src[i] = vtn_src[i]->def;
+   }
+
+   switch (opcode) {
+   case SpvOpAny:
+      if (src[0]->num_components == 1) {
+         val->ssa->def = nir_imov(&b->nb, src[0]);
+      } else {
+         nir_op op;
+         switch (src[0]->num_components) {
+         case 2:  op = nir_op_bany_inequal2; break;
+         case 3:  op = nir_op_bany_inequal3; break;
+         case 4:  op = nir_op_bany_inequal4; break;
+         /* Fix: without a default, op is read uninitialized if the
+          * component count is ever out of range.
+          */
+         default: unreachable("Invalid number of components");
+         }
+         val->ssa->def = nir_build_alu(&b->nb, op, src[0],
+                                       nir_imm_int(&b->nb, NIR_FALSE),
+                                       NULL, NULL);
+      }
+      return;
+
+   case SpvOpAll:
+      if (src[0]->num_components == 1) {
+         val->ssa->def = nir_imov(&b->nb, src[0]);
+      } else {
+         nir_op op;
+         switch (src[0]->num_components) {
+         case 2:  op = nir_op_ball_iequal2;  break;
+         case 3:  op = nir_op_ball_iequal3;  break;
+         case 4:  op = nir_op_ball_iequal4;  break;
+         default: unreachable("Invalid number of components");
+         }
+         val->ssa->def = nir_build_alu(&b->nb, op, src[0],
+                                       nir_imm_int(&b->nb, NIR_TRUE),
+                                       NULL, NULL);
+      }
+      return;
+
+   case SpvOpOuterProduct: {
+      /* Column i of the result is src[0] scaled by component i of src[1]. */
+      for (unsigned i = 0; i < src[1]->num_components; i++) {
+         val->ssa->elems[i]->def =
+            nir_fmul(&b->nb, src[0], nir_channel(&b->nb, src[1], i));
+      }
+      return;
+   }
+
+   case SpvOpDot:
+      val->ssa->def = nir_fdot(&b->nb, src[0], src[1]);
+      return;
+
+   case SpvOpIAddCarry:
+      assert(glsl_type_is_struct(val->ssa->type));
+      val->ssa->elems[0]->def = nir_iadd(&b->nb, src[0], src[1]);
+      val->ssa->elems[1]->def = nir_uadd_carry(&b->nb, src[0], src[1]);
+      return;
+
+   case SpvOpISubBorrow:
+      assert(glsl_type_is_struct(val->ssa->type));
+      val->ssa->elems[0]->def = nir_isub(&b->nb, src[0], src[1]);
+      val->ssa->elems[1]->def = nir_usub_borrow(&b->nb, src[0], src[1]);
+      return;
+
+   case SpvOpUMulExtended:
+      assert(glsl_type_is_struct(val->ssa->type));
+      val->ssa->elems[0]->def = nir_imul(&b->nb, src[0], src[1]);
+      val->ssa->elems[1]->def = nir_umul_high(&b->nb, src[0], src[1]);
+      return;
+
+   case SpvOpSMulExtended:
+      assert(glsl_type_is_struct(val->ssa->type));
+      val->ssa->elems[0]->def = nir_imul(&b->nb, src[0], src[1]);
+      val->ssa->elems[1]->def = nir_imul_high(&b->nb, src[0], src[1]);
+      return;
+
+   /* Fix: Fwidth is a unary op (fwidth(p) = |dFdx(p)| + |dFdy(p)|, per
+    * the SPIR-V/GLSL specs).  The previous code summed two x-derivatives
+    * and read src[1], which is NULL for a single-operand instruction.
+    */
+   case SpvOpFwidth:
+      val->ssa->def = nir_fadd(&b->nb,
+                               nir_fabs(&b->nb, nir_fddx(&b->nb, src[0])),
+                               nir_fabs(&b->nb, nir_fddy(&b->nb, src[0])));
+      return;
+   case SpvOpFwidthFine:
+      val->ssa->def = nir_fadd(&b->nb,
+                               nir_fabs(&b->nb, nir_fddx_fine(&b->nb, src[0])),
+                               nir_fabs(&b->nb, nir_fddy_fine(&b->nb, src[0])));
+      return;
+   case SpvOpFwidthCoarse:
+      val->ssa->def = nir_fadd(&b->nb,
+                               nir_fabs(&b->nb, nir_fddx_coarse(&b->nb, src[0])),
+                               nir_fabs(&b->nb, nir_fddy_coarse(&b->nb, src[0])));
+      return;
+
+   case SpvOpVectorTimesScalar:
+      /* The builder will take care of splatting for us. */
+      val->ssa->def = nir_fmul(&b->nb, src[0], src[1]);
+      return;
+
+   case SpvOpIsNan:
+      /* NaN is the only value that compares unequal to itself. */
+      val->ssa->def = nir_fne(&b->nb, src[0], src[0]);
+      return;
+
+   case SpvOpIsInf:
+      val->ssa->def = nir_feq(&b->nb, nir_fabs(&b->nb, src[0]),
+                                      nir_imm_float(&b->nb, INFINITY));
+      return;
+
+   default: {
+      bool swap;
+      nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap);
+
+      if (swap) {
+         nir_ssa_def *tmp = src[0];
+         src[0] = src[1];
+         src[1] = tmp;
+      }
+
+      val->ssa->def = nir_build_alu(&b->nb, op, src[0], src[1], src[2], src[3]);
+      return;
+   } /* default */
+   }
+}
diff --git a/src/glsl/nir/spirv/vtn_cfg.c b/src/glsl/nir/spirv/vtn_cfg.c
new file mode 100644 (file)
index 0000000..0d3702c
--- /dev/null
@@ -0,0 +1,764 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vtn_private.h"
+#include "nir/nir_vla.h"
+
+/* First CFG pass: walk the function bodies and record, without emitting
+ * any NIR control flow, the skeleton the later walk needs: vtn_function
+ * objects (with their NIR function/impl shells), vtn_blocks keyed by
+ * label id, and raw pointers to each block's merge and branch
+ * instructions.  Always returns true so iteration continues.
+ */
+static bool
+vtn_cfg_handle_prepass_instruction(struct vtn_builder *b, SpvOp opcode,
+                                   const uint32_t *w, unsigned count)
+{
+   switch (opcode) {
+   case SpvOpFunction: {
+      assert(b->func == NULL);
+      b->func = rzalloc(b, struct vtn_function);
+
+      list_inithead(&b->func->body);
+      b->func->control = w[3];
+
+      const struct glsl_type *result_type =
+         vtn_value(b, w[1], vtn_value_type_type)->type->type;
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_function);
+      val->func = b->func;
+
+      const struct glsl_type *func_type =
+         vtn_value(b, w[4], vtn_value_type_type)->type->type;
+
+      /* The declared result type must match the function type's return. */
+      assert(glsl_get_function_return_type(func_type) == result_type);
+
+      nir_function *func =
+         nir_function_create(b->shader, ralloc_strdup(b->shader, val->name));
+
+      /* Translate each GLSL function parameter to a nir_parameter,
+       * mapping the in/out flags to the NIR parameter type.
+       */
+      func->num_params = glsl_get_length(func_type);
+      func->params = ralloc_array(b->shader, nir_parameter, func->num_params);
+      for (unsigned i = 0; i < func->num_params; i++) {
+         const struct glsl_function_param *param =
+            glsl_get_function_param(func_type, i);
+         func->params[i].type = param->type;
+         if (param->in) {
+            if (param->out) {
+               func->params[i].param_type = nir_parameter_inout;
+            } else {
+               func->params[i].param_type = nir_parameter_in;
+            }
+         } else {
+            if (param->out) {
+               func->params[i].param_type = nir_parameter_out;
+            } else {
+               assert(!"Parameter is neither in nor out");
+            }
+         }
+      }
+
+      func->return_type = glsl_get_function_return_type(func_type);
+
+      b->func->impl = nir_function_impl_create(func);
+      if (!glsl_type_is_void(func->return_type)) {
+         b->func->impl->return_var =
+            nir_local_variable_create(b->func->impl, func->return_type, "ret");
+      }
+
+      /* OpFunctionParameter instructions follow; count them from zero. */
+      b->func_param_idx = 0;
+      break;
+   }
+
+   case SpvOpFunctionEnd:
+      b->func->end = w;
+      b->func = NULL;
+      break;
+
+   case SpvOpFunctionParameter: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_deref);
+
+      assert(b->func_param_idx < b->func->impl->num_params);
+      unsigned idx = b->func_param_idx++;
+
+      /* Parameters are materialized as local variables; vtn accesses
+       * them through derefs like any other variable.
+       */
+      nir_variable *param =
+         nir_local_variable_create(b->func->impl,
+                                   b->func->impl->function->params[idx].type,
+                                   val->name);
+
+      b->func->impl->params[idx] = param;
+      val->deref = nir_deref_var_create(b, param);
+      val->deref_type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      break;
+   }
+
+   case SpvOpLabel: {
+      assert(b->block == NULL);
+      b->block = rzalloc(b, struct vtn_block);
+      b->block->node.type = vtn_cf_node_type_block;
+      b->block->label = w;
+      vtn_push_value(b, w[1], vtn_value_type_block)->block = b->block;
+
+      if (b->func->start_block == NULL) {
+         /* This is the first block encountered for this function.  In this
+          * case, we set the start block and add it to the list of
+          * implemented functions that we'll walk later.
+          */
+         b->func->start_block = b->block;
+         exec_list_push_tail(&b->functions, &b->func->node);
+      }
+      break;
+   }
+
+   case SpvOpSelectionMerge:
+   case SpvOpLoopMerge:
+      /* Remember the raw merge instruction; vtn_cfg_walk_blocks() decodes
+       * it when structuring control flow.
+       */
+      assert(b->block && b->block->merge == NULL);
+      b->block->merge = w;
+      break;
+
+   case SpvOpBranch:
+   case SpvOpBranchConditional:
+   case SpvOpSwitch:
+   case SpvOpKill:
+   case SpvOpReturn:
+   case SpvOpReturnValue:
+   case SpvOpUnreachable:
+      /* Any terminator ends the current block. */
+      assert(b->block && b->block->branch == NULL);
+      b->block->branch = w;
+      b->block = NULL;
+      break;
+
+   default:
+      /* Continue on as per normal */
+      return true;
+   }
+
+   return true;
+}
+
+/* Record one OpSwitch target: create (or reuse) the vtn_case for
+ * |block_id| on |swtch| and attach either the default flag or the
+ * literal |val|.  Targets that jump straight to the switch's break
+ * block are dropped entirely rather than emitted as empty cases.
+ * A block may be named by several literals, hence the values array.
+ */
+static void
+vtn_add_case(struct vtn_builder *b, struct vtn_switch *swtch,
+             struct vtn_block *break_block,
+             uint32_t block_id, uint32_t val, bool is_default)
+{
+   struct vtn_block *case_block =
+      vtn_value(b, block_id, vtn_value_type_block)->block;
+
+   /* Don't create dummy cases that just break */
+   if (case_block == break_block)
+      return;
+
+   if (case_block->switch_case == NULL) {
+      struct vtn_case *c = ralloc(b, struct vtn_case);
+
+      list_inithead(&c->body);
+      c->start_block = case_block;
+      c->fallthrough = NULL;
+      nir_array_init(&c->values, b);
+      c->is_default = false;
+      c->visited = false;
+
+      list_addtail(&c->link, &swtch->cases);
+
+      case_block->switch_case = c;
+   }
+
+   if (is_default) {
+      case_block->switch_case->is_default = true;
+   } else {
+      nir_array_add(&case_block->switch_case->values, uint32_t, val);
+   }
+}
+
+/* This function performs a depth-first search of the cases and puts them
+ * in fall-through order.
+ */
+static void
+vtn_order_case(struct vtn_switch *swtch, struct vtn_case *cse)
+{
+   if (cse->visited)
+      return;
+
+   cse->visited = true;
+
+   list_del(&cse->link);
+
+   if (cse->fallthrough) {
+      vtn_order_case(swtch, cse->fallthrough);
+
+      /* If we have a fall-through, place this case right before the case it
+       * falls through to.  This ensures that fallthroughs come one after
+       * the other.  These two can never get separated because that would
+       * imply something else falling through to the same case.  Also, this
+       * can't break ordering because the DFS ensures that this case is
+       * visited before anything that falls through to it.
+       */
+      list_addtail(&cse->link, &cse->fallthrough->link);
+   } else {
+      list_add(&cse->link, &swtch->cases);
+   }
+}
+
+/* Classify a branch to |block| relative to the enclosing constructs:
+ * switch fallthrough, switch break, loop break, loop continue, or a
+ * plain forward edge (vtn_branch_type_none).  As a side effect, a
+ * branch into another case records the fallthrough edge on |swcase|.
+ * Any of the context blocks may be NULL when not inside that construct.
+ */
+static enum vtn_branch_type
+vtn_get_branch_type(struct vtn_block *block,
+                    struct vtn_case *swcase, struct vtn_block *switch_break,
+                    struct vtn_block *loop_break, struct vtn_block *loop_cont)
+{
+   if (block->switch_case) {
+      /* This branch is actually a fallthrough */
+      assert(swcase->fallthrough == NULL ||
+             swcase->fallthrough == block->switch_case);
+      swcase->fallthrough = block->switch_case;
+      return vtn_branch_type_switch_fallthrough;
+   } else if (block == switch_break) {
+      return vtn_branch_type_switch_break;
+   } else if (block == loop_break) {
+      return vtn_branch_type_loop_break;
+   } else if (block == loop_cont) {
+      return vtn_branch_type_loop_continue;
+   } else {
+      return vtn_branch_type_none;
+   }
+}
+
+/* Structurize the CFG: walk blocks from |start| until |end| (exclusive;
+ * NULL means "until a terminating branch"), appending vtn_cf_nodes
+ * (blocks, ifs, loops, switches) to |cf_list|.  The switch_case /
+ * switch_break / loop_break / loop_cont arguments describe the
+ * enclosing constructs so branches can be classified by
+ * vtn_get_branch_type(); any of them may be NULL.  Relies on SPIR-V's
+ * structured-control-flow rules (merge instructions) throughout.
+ */
+static void
+vtn_cfg_walk_blocks(struct vtn_builder *b, struct list_head *cf_list,
+                    struct vtn_block *start, struct vtn_case *switch_case,
+                    struct vtn_block *switch_break,
+                    struct vtn_block *loop_break, struct vtn_block *loop_cont,
+                    struct vtn_block *end)
+{
+   struct vtn_block *block = start;
+   while (block != end) {
+      /* A block with an OpLoopMerge is a loop header; build the vtn_loop
+       * the first time we see it (block->loop guards against recursing
+       * into the same header forever -- see the long note below).
+       */
+      if (block->merge && (*block->merge & SpvOpCodeMask) == SpvOpLoopMerge &&
+          !block->loop) {
+         struct vtn_loop *loop = ralloc(b, struct vtn_loop);
+
+         loop->node.type = vtn_cf_node_type_loop;
+         list_inithead(&loop->body);
+         list_inithead(&loop->cont_body);
+         loop->control = block->merge[3];
+
+         list_addtail(&loop->node.link, cf_list);
+         block->loop = loop;
+
+         /* OpLoopMerge operands: merge[1] = merge (break) block,
+          * merge[2] = continue target.
+          */
+         struct vtn_block *new_loop_break =
+            vtn_value(b, block->merge[1], vtn_value_type_block)->block;
+         struct vtn_block *new_loop_cont =
+            vtn_value(b, block->merge[2], vtn_value_type_block)->block;
+
+         /* Note: This recursive call will start with the current block as
+          * its start block.  If we weren't careful, we would get here
+          * again and end up in infinite recursion.  This is why we set
+          * block->loop above and check for it before creating one.  This
+          * way, we only create the loop once and the second call that
+          * tries to handle this loop goes to the cases below and gets
+          * handled as a regular block.
+          *
+          * Note: When we make the recursive walk calls, we pass NULL for
+          * the switch break since you have to break out of the loop first.
+          * We do, however, still pass the current switch case because it's
+          * possible that the merge block for the loop is the start of
+          * another case.
+          */
+         vtn_cfg_walk_blocks(b, &loop->body, block, switch_case, NULL,
+                             new_loop_break, new_loop_cont, NULL );
+         vtn_cfg_walk_blocks(b, &loop->cont_body, new_loop_cont, NULL, NULL,
+                             new_loop_break, NULL, block);
+
+         /* Resume the walk after the loop at its merge block. */
+         block = new_loop_break;
+         continue;
+      }
+
+      assert(block->node.link.next == NULL);
+      list_addtail(&block->node.link, cf_list);
+
+      switch (*block->branch & SpvOpCodeMask) {
+      case SpvOpBranch: {
+         struct vtn_block *branch_block =
+            vtn_value(b, block->branch[1], vtn_value_type_block)->block;
+
+         block->branch_type = vtn_get_branch_type(branch_block,
+                                                  switch_case, switch_break,
+                                                  loop_break, loop_cont);
+
+         /* A break/continue/fallthrough edge ends this walk; a plain
+          * forward edge just moves on to the target block.
+          */
+         if (block->branch_type != vtn_branch_type_none)
+            return;
+
+         block = branch_block;
+         continue;
+      }
+
+      case SpvOpReturn:
+      case SpvOpReturnValue:
+         block->branch_type = vtn_branch_type_return;
+         return;
+
+      case SpvOpKill:
+         block->branch_type = vtn_branch_type_discard;
+         return;
+
+      case SpvOpBranchConditional: {
+         struct vtn_block *then_block =
+            vtn_value(b, block->branch[2], vtn_value_type_block)->block;
+         struct vtn_block *else_block =
+            vtn_value(b, block->branch[3], vtn_value_type_block)->block;
+
+         struct vtn_if *if_stmt = ralloc(b, struct vtn_if);
+
+         if_stmt->node.type = vtn_cf_node_type_if;
+         if_stmt->condition = block->branch[1];
+         list_inithead(&if_stmt->then_body);
+         list_inithead(&if_stmt->else_body);
+
+         list_addtail(&if_stmt->node.link, cf_list);
+
+         if (block->merge &&
+             (*block->merge & SpvOpCodeMask) == SpvOpSelectionMerge) {
+            if_stmt->control = block->merge[2];
+         }
+
+         if_stmt->then_type = vtn_get_branch_type(then_block,
+                                                  switch_case, switch_break,
+                                                  loop_break, loop_cont);
+         if_stmt->else_type = vtn_get_branch_type(else_block,
+                                                  switch_case, switch_break,
+                                                  loop_break, loop_cont);
+
+         if (if_stmt->then_type == vtn_branch_type_none &&
+             if_stmt->else_type == vtn_branch_type_none) {
+            /* Neither side of the if is something we can short-circuit. */
+            assert((*block->merge & SpvOpCodeMask) == SpvOpSelectionMerge);
+            struct vtn_block *merge_block =
+               vtn_value(b, block->merge[1], vtn_value_type_block)->block;
+
+            vtn_cfg_walk_blocks(b, &if_stmt->then_body, then_block,
+                                switch_case, switch_break,
+                                loop_break, loop_cont, merge_block);
+            vtn_cfg_walk_blocks(b, &if_stmt->else_body, else_block,
+                                switch_case, switch_break,
+                                loop_break, loop_cont, merge_block);
+
+            enum vtn_branch_type merge_type =
+               vtn_get_branch_type(merge_block, switch_case, switch_break,
+                                   loop_break, loop_cont);
+            if (merge_type == vtn_branch_type_none) {
+               block = merge_block;
+               continue;
+            } else {
+               return;
+            }
+         } else if (if_stmt->then_type != vtn_branch_type_none &&
+                    if_stmt->else_type != vtn_branch_type_none) {
+            /* Both sides were short-circuited.  We're done here. */
+            return;
+         } else {
+            /* Exactly one side of the branch could be short-circuited.
+             * We set the branch up as a predicated break/continue and we
+             * continue on with the other side as if it were what comes
+             * after the if.
+             */
+            if (if_stmt->then_type == vtn_branch_type_none) {
+               block = then_block;
+            } else {
+               block = else_block;
+            }
+            continue;
+         }
+         unreachable("Should have returned or continued");
+      }
+
+      case SpvOpSwitch: {
+         assert((*block->merge & SpvOpCodeMask) == SpvOpSelectionMerge);
+         struct vtn_block *break_block =
+            vtn_value(b, block->merge[1], vtn_value_type_block)->block;
+
+         struct vtn_switch *swtch = ralloc(b, struct vtn_switch);
+
+         swtch->node.type = vtn_cf_node_type_switch;
+         swtch->selector = block->branch[1];
+         list_inithead(&swtch->cases);
+
+         list_addtail(&swtch->node.link, cf_list);
+
+         /* First, we go through and record all of the cases. */
+         const uint32_t *branch_end =
+            block->branch + (block->branch[0] >> SpvWordCountShift);
+
+         /* branch[2] is the default target; (literal, label) pairs follow. */
+         vtn_add_case(b, swtch, break_block, block->branch[2], 0, true);
+         for (const uint32_t *w = block->branch + 3; w < branch_end; w += 2)
+            vtn_add_case(b, swtch, break_block, w[1], w[0], false);
+
+         /* Now, we go through and walk the blocks.  While we walk through
+          * the blocks, we also gather the much-needed fall-through
+          * information.
+          */
+         list_for_each_entry(struct vtn_case, cse, &swtch->cases, link) {
+            assert(cse->start_block != break_block);
+            vtn_cfg_walk_blocks(b, &cse->body, cse->start_block, cse,
+                                break_block, NULL, loop_cont, NULL);
+         }
+
+         /* Finally, we walk over all of the cases one more time and put
+          * them in fall-through order.
+          */
+         for (const uint32_t *w = block->branch + 2; w < branch_end; w += 2) {
+            struct vtn_block *case_block =
+               vtn_value(b, *w, vtn_value_type_block)->block;
+
+            if (case_block == break_block)
+               continue;
+
+            assert(case_block->switch_case);
+
+            vtn_order_case(swtch, case_block->switch_case);
+         }
+
+         block = break_block;
+         continue;
+      }
+
+      case SpvOpUnreachable:
+         return;
+
+      default:
+         unreachable("Unhandled opcode");
+      }
+   }
+}
+
+/* Build the structured control-flow tree for every function in the module.
+ *
+ * First pass scans the raw SPIR-V words (vtn_cfg_handle_prepass_instruction)
+ * so that labels, merges, and branches can be resolved before walking.
+ * Second pass walks each function's blocks into a tree of
+ * block/if/loop/switch CF nodes, starting with no enclosing switch or
+ * loop context (all context arguments NULL).
+ */
+void
+vtn_build_cfg(struct vtn_builder *b, const uint32_t *words, const uint32_t *end)
+{
+   vtn_foreach_instruction(b, words, end,
+                           vtn_cfg_handle_prepass_instruction);
+
+   foreach_list_typed(struct vtn_function, func, node, &b->functions) {
+      vtn_cfg_walk_blocks(b, &func->body, func->start_block,
+                          NULL, NULL, NULL, NULL, NULL);
+   }
+}
+
+/* First phi pass: replace each SpvOpPhi with a load of a fresh local
+ * variable.  Returns false to stop the walk at the first non-phi,
+ * non-label instruction (phis may only appear at the top of a block).
+ */
+static bool
+vtn_handle_phis_first_pass(struct vtn_builder *b, SpvOp opcode,
+                           const uint32_t *w, unsigned count)
+{
+   /* Labels are fine; anything else that isn't a phi ends the leading
+    * run of phi instructions in this block.
+    */
+   if (opcode != SpvOpPhi)
+      return opcode == SpvOpLabel;
+
+   /* Poor-man's out-of-SSA done on the spot: each phi becomes a local
+    * variable of the matching type, and the phi's value is a load of
+    * that variable.  The second pass (vtn_handle_phi_second_pass)
+    * stores the per-predecessor sources into the same variable.
+    *
+    * Doing something smarter would require dominance information and
+    * would amount to re-implementing the into-SSA algorithm, so we let
+    * the lower_vars_to_ssa pass clean this up for us instead.
+    */
+   struct vtn_type *res_type = vtn_value(b, w[1], vtn_value_type_type)->type;
+   nir_variable *var =
+      nir_local_variable_create(b->nb.impl, res_type->type, "phi");
+   _mesa_hash_table_insert(b->phi_table, w, var);
+
+   struct vtn_value *phi_val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+   phi_val->ssa = vtn_variable_load(b, nir_deref_var_create(b, var), res_type);
+
+   return true;
+}
+
+/* Second phi pass: for every phi, store each predecessor's source into
+ * the variable created by the first pass, just before the predecessor
+ * block's trailing jump (if any).  Runs after emission so that every
+ * block's end_block is known.
+ */
+static bool
+vtn_handle_phi_second_pass(struct vtn_builder *b, SpvOp opcode,
+                           const uint32_t *w, unsigned count)
+{
+   if (opcode != SpvOpPhi)
+      return true;
+
+   struct hash_entry *entry = _mesa_hash_table_search(b->phi_table, w);
+   assert(entry);
+   nir_variable *var = entry->data;
+
+   struct vtn_type *res_type = vtn_value(b, w[1], vtn_value_type_type)->type;
+
+   /* Phi operands come in (value, parent-block) pairs starting at w[3]. */
+   for (unsigned i = 3; i < count; i += 2) {
+      struct vtn_ssa_value *src = vtn_ssa_value(b, w[i]);
+      struct vtn_block *pred =
+         vtn_value(b, w[i + 1], vtn_value_type_block)->block;
+
+      b->nb.cursor = nir_after_block_before_jump(pred->end_block);
+      vtn_variable_store(b, src, nir_deref_var_create(b, var), res_type);
+   }
+
+   return true;
+}
+
+/* Emit the NIR side-effects of a non-local branch.
+ *
+ * Loop break/continue and return map directly onto NIR jump
+ * instructions and discard becomes an intrinsic.  A switch break has
+ * no NIR equivalent, so it stores false into the switch's "fall"
+ * variable (code after the break is predicated on that variable by
+ * vtn_emit_cf_list) and reports the break via *has_switch_break.
+ * Switch fallthrough needs no code at all: the fall variable is
+ * already true when the case body ends.
+ */
+static void
+vtn_emit_branch(struct vtn_builder *b, enum vtn_branch_type branch_type,
+                nir_variable *switch_fall_var, bool *has_switch_break)
+{
+   switch (branch_type) {
+   case vtn_branch_type_switch_break:
+      nir_store_var(&b->nb, switch_fall_var, nir_imm_int(&b->nb, NIR_FALSE), 1);
+      *has_switch_break = true;
+      break;
+   case vtn_branch_type_switch_fallthrough:
+      break; /* Nothing to do */
+   case vtn_branch_type_loop_break:
+      nir_jump(&b->nb, nir_jump_break);
+      break;
+   case vtn_branch_type_loop_continue:
+      nir_jump(&b->nb, nir_jump_continue);
+      break;
+   case vtn_branch_type_return:
+      nir_jump(&b->nb, nir_jump_return);
+      break;
+   case vtn_branch_type_discard: {
+      nir_intrinsic_instr *discard =
+         nir_intrinsic_instr_create(b->nb.shader, nir_intrinsic_discard);
+      nir_builder_instr_insert(&b->nb, &discard->instr);
+      break;
+   }
+   default:
+      unreachable("Invalid branch type");
+   }
+}
+
+/* Emit NIR for one structured control-flow list (a function body, loop
+ * body, case body, etc.), recursing into nested ifs/loops/switches.
+ *
+ * switch_fall_var/has_switch_break carry the innermost enclosing
+ * switch's fallthrough state (NULL when not inside a switch); handler
+ * is the per-instruction callback used to emit the actual block
+ * contents.
+ */
+static void
+vtn_emit_cf_list(struct vtn_builder *b, struct list_head *cf_list,
+                 nir_variable *switch_fall_var, bool *has_switch_break,
+                 vtn_instruction_handler handler)
+{
+   list_for_each_entry(struct vtn_cf_node, node, cf_list, link) {
+      switch (node->type) {
+      case vtn_cf_node_type_block: {
+         struct vtn_block *block = (struct vtn_block *)node;
+
+         /* The block's instructions run from its label up to (but not
+          * including) its merge (if any) or its terminating branch.
+          */
+         const uint32_t *block_start = block->label;
+         const uint32_t *block_end = block->merge ? block->merge :
+                                                    block->branch;
+
+         /* Consume the leading run of phis (turned into variable
+          * loads), then hand the rest of the block to the handler.
+          */
+         block_start = vtn_foreach_instruction(b, block_start, block_end,
+                                               vtn_handle_phis_first_pass);
+
+         vtn_foreach_instruction(b, block_start, block_end, handler);
+
+         /* Remember where this block ended up so the phi second pass
+          * can insert predecessor stores there.
+          */
+         block->end_block = nir_cursor_current_block(b->nb.cursor);
+
+         /* OpReturnValue: store the result into the function's return
+          * variable before the actual return jump is emitted below.
+          */
+         if ((*block->branch & SpvOpCodeMask) == SpvOpReturnValue) {
+            struct vtn_ssa_value *src = vtn_ssa_value(b, block->branch[1]);
+            vtn_variable_store(b, src,
+                               nir_deref_var_create(b, b->impl->return_var),
+                               NULL);
+         }
+
+         if (block->branch_type != vtn_branch_type_none) {
+            vtn_emit_branch(b, block->branch_type,
+                            switch_fall_var, has_switch_break);
+         }
+
+         break;
+      }
+
+      case vtn_cf_node_type_if: {
+         struct vtn_if *vtn_if = (struct vtn_if *)node;
+
+         nir_if *if_stmt = nir_if_create(b->shader);
+         if_stmt->condition =
+            nir_src_for_ssa(vtn_ssa_value(b, vtn_if->condition)->def);
+         nir_cf_node_insert(b->nb.cursor, &if_stmt->cf_node);
+
+         bool sw_break = false;
+
+         /* Each side either has a real body or was short-circuited into
+          * a single branch during CFG construction.
+          */
+         b->nb.cursor = nir_after_cf_list(&if_stmt->then_list);
+         if (vtn_if->then_type == vtn_branch_type_none) {
+            vtn_emit_cf_list(b, &vtn_if->then_body,
+                             switch_fall_var, &sw_break, handler);
+         } else {
+            vtn_emit_branch(b, vtn_if->then_type, switch_fall_var, &sw_break);
+         }
+
+         b->nb.cursor = nir_after_cf_list(&if_stmt->else_list);
+         if (vtn_if->else_type == vtn_branch_type_none) {
+            vtn_emit_cf_list(b, &vtn_if->else_body,
+                             switch_fall_var, &sw_break, handler);
+         } else {
+            vtn_emit_branch(b, vtn_if->else_type, switch_fall_var, &sw_break);
+         }
+
+         b->nb.cursor = nir_after_cf_node(&if_stmt->cf_node);
+
+         /* If we encountered a switch break somewhere inside of the if,
+          * then it would have been handled correctly by calling
+          * emit_cf_list or emit_branch for the interior.  However, we
+          * need to predicate everything following on whether or not
+          * we're still going: wrap the remainder of the list in a new
+          * nir_if on the fall variable and keep emitting into its then
+          * branch.
+          */
+         if (sw_break) {
+            *has_switch_break = true;
+
+            nir_if *switch_if = nir_if_create(b->shader);
+            switch_if->condition =
+               nir_src_for_ssa(nir_load_var(&b->nb, switch_fall_var));
+            nir_cf_node_insert(b->nb.cursor, &switch_if->cf_node);
+
+            b->nb.cursor = nir_after_cf_list(&if_stmt->then_list);
+         }
+         break;
+      }
+
+      case vtn_cf_node_type_loop: {
+         struct vtn_loop *vtn_loop = (struct vtn_loop *)node;
+
+         nir_loop *loop = nir_loop_create(b->shader);
+         nir_cf_node_insert(b->nb.cursor, &loop->cf_node);
+
+         /* The loop body gets a fresh switch context (NULL): a switch
+          * break cannot cross a loop boundary.
+          */
+         b->nb.cursor = nir_after_cf_list(&loop->body);
+         vtn_emit_cf_list(b, &vtn_loop->body, NULL, NULL, handler);
+
+         if (!list_empty(&vtn_loop->cont_body)) {
+            /* If we have a non-trivial continue body then we need to put
+             * it at the beginning of the loop with a flag to ensure that
+             * it doesn't get executed in the first iteration.
+             */
+            nir_variable *do_cont =
+               nir_local_variable_create(b->nb.impl, glsl_bool_type(), "cont");
+
+            /* Initialize the flag to false just before the loop... */
+            b->nb.cursor = nir_before_cf_node(&loop->cf_node);
+            nir_store_var(&b->nb, do_cont, nir_imm_int(&b->nb, NIR_FALSE), 1);
+
+            /* ...guard the continue body on it at the top of the loop... */
+            b->nb.cursor = nir_before_cf_list(&loop->body);
+            nir_if *cont_if = nir_if_create(b->shader);
+            cont_if->condition = nir_src_for_ssa(nir_load_var(&b->nb, do_cont));
+            nir_cf_node_insert(b->nb.cursor, &cont_if->cf_node);
+
+            b->nb.cursor = nir_after_cf_list(&cont_if->then_list);
+            vtn_emit_cf_list(b, &vtn_loop->cont_body, NULL, NULL, handler);
+
+            /* ...and set it to true after the guard so every later
+             * iteration runs the continue body.
+             */
+            b->nb.cursor = nir_after_cf_node(&cont_if->cf_node);
+            nir_store_var(&b->nb, do_cont, nir_imm_int(&b->nb, NIR_TRUE), 1);
+
+            /* Moving the continue body above the loop body breaks SSA
+             * dominance; vtn_function_emit repairs it when this is set.
+             */
+            b->has_loop_continue = true;
+         }
+
+         b->nb.cursor = nir_after_cf_node(&loop->cf_node);
+         break;
+      }
+
+      case vtn_cf_node_type_switch: {
+         struct vtn_switch *vtn_switch = (struct vtn_switch *)node;
+
+         /* First, we create a variable to keep track of whether or not the
+          * switch is still going at any given point.  Any switch breaks
+          * will set this variable to false.
+          */
+         nir_variable *fall_var =
+            nir_local_variable_create(b->nb.impl, glsl_bool_type(), "fall");
+         nir_store_var(&b->nb, fall_var, nir_imm_int(&b->nb, NIR_FALSE), 1);
+
+         /* Next, we gather up all of the conditions.  We have to do this
+          * up-front because we also need to build an "any" condition so
+          * that we can use !any for default.
+          */
+         const int num_cases = list_length(&vtn_switch->cases);
+         NIR_VLA(nir_ssa_def *, conditions, num_cases);
+
+         nir_ssa_def *sel = vtn_ssa_value(b, vtn_switch->selector)->def;
+         /* An accumulation of all conditions.  Used for the default */
+         nir_ssa_def *any = NULL;
+
+         int i = 0;
+         list_for_each_entry(struct vtn_case, cse, &vtn_switch->cases, link) {
+            if (cse->is_default) {
+               conditions[i++] = NULL;
+               continue;
+            }
+
+            /* A case matches if the selector equals any of its values. */
+            nir_ssa_def *cond = NULL;
+            nir_array_foreach(&cse->values, uint32_t, val) {
+               nir_ssa_def *is_val =
+                  nir_ieq(&b->nb, sel, nir_imm_int(&b->nb, *val));
+
+               cond = cond ? nir_ior(&b->nb, cond, is_val) : is_val;
+            }
+
+            any = any ? nir_ior(&b->nb, any, cond) : cond;
+            conditions[i++] = cond;
+         }
+         assert(i == num_cases);
+
+         /* Now we can walk the list of cases and actually emit code.
+          * The cases are in fall-through order, so entering a case sets
+          * the fall variable and fallthrough into the next case is just
+          * the fall variable still being true.
+          */
+         i = 0;
+         list_for_each_entry(struct vtn_case, cse, &vtn_switch->cases, link) {
+            /* Figure out the condition */
+            nir_ssa_def *cond = conditions[i++];
+            if (cse->is_default) {
+               assert(cond == NULL);
+               cond = nir_inot(&b->nb, any);
+            }
+            /* Take fallthrough into account */
+            cond = nir_ior(&b->nb, cond, nir_load_var(&b->nb, fall_var));
+
+            nir_if *case_if = nir_if_create(b->nb.shader);
+            case_if->condition = nir_src_for_ssa(cond);
+            nir_cf_node_insert(b->nb.cursor, &case_if->cf_node);
+
+            bool has_break = false;
+            b->nb.cursor = nir_after_cf_list(&case_if->then_list);
+            nir_store_var(&b->nb, fall_var, nir_imm_int(&b->nb, NIR_TRUE), 1);
+            vtn_emit_cf_list(b, &cse->body, fall_var, &has_break, handler);
+            (void)has_break; /* We don't care */
+
+            b->nb.cursor = nir_after_cf_node(&case_if->cf_node);
+         }
+         assert(i == num_cases);
+
+         break;
+      }
+
+      default:
+         unreachable("Invalid CF node type");
+      }
+   }
+}
+
+/* Emit NIR for an entire function: walk its structured CF tree with
+ * instruction_handler, then resolve SPIR-V phis in a second pass, and
+ * finally repair SSA if any loop continue body was relocated.
+ */
+void
+vtn_function_emit(struct vtn_builder *b, struct vtn_function *func,
+                  vtn_instruction_handler instruction_handler)
+{
+   nir_builder_init(&b->nb, func->impl);
+   b->nb.cursor = nir_after_cf_list(&func->impl->body);
+   b->has_loop_continue = false;
+   /* Maps each SpvOpPhi (keyed by its instruction word pointer) to the
+    * local variable created for it by vtn_handle_phis_first_pass.
+    */
+   b->phi_table = _mesa_hash_table_create(b, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+
+   vtn_emit_cf_list(b, &func->body, NULL, NULL, instruction_handler);
+
+   /* Every block's end_block is known now, so we can add the
+    * per-predecessor stores for each phi.
+    */
+   vtn_foreach_instruction(b, func->start_block->label, func->end,
+                           vtn_handle_phi_second_pass);
+
+   /* Continue blocks for loops get inserted before the body of the loop
+    * but instructions in the continue may use SSA defs in the loop body.
+    * Therefore, we need to repair SSA to insert the needed phi nodes.
+    */
+   if (b->has_loop_continue)
+      nir_repair_ssa_impl(func->impl);
+}
diff --git a/src/glsl/nir/spirv/vtn_glsl450.c b/src/glsl/nir/spirv/vtn_glsl450.c
new file mode 100644 (file)
index 0000000..eb81620
--- /dev/null
@@ -0,0 +1,671 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+#include "vtn_private.h"
+#include "GLSL.std.450.h"
+
+#define M_PIf   ((float) M_PI)
+#define M_PI_2f ((float) M_PI_2)
+#define M_PI_4f ((float) M_PI_4)
+
+/* Determinant of a 2x2 matrix given as two column vectors:
+ * col0.x * col1.y - col0.y * col1.x, done as a vector multiply against
+ * col1.yx followed by a horizontal subtract.
+ */
+static nir_ssa_def *
+build_mat2_det(nir_builder *b, nir_ssa_def *col[2])
+{
+   unsigned yx[4] = {1, 0, 0, 0};
+   nir_ssa_def *col1_yx = nir_swizzle(b, col[1], yx, 2, true);
+   nir_ssa_def *prod = nir_fmul(b, col[0], col1_yx);
+   return nir_fsub(b, nir_channel(b, prod, 0), nir_channel(b, prod, 1));
+}
+
+/* Determinant of a 3x3 matrix (three column vectors) via the rule of
+ * Sarrus: sum the components of
+ * col0 * (col1.yzx * col2.zxy - col1.zxy * col2.yzx).
+ */
+static nir_ssa_def *
+build_mat3_det(nir_builder *b, nir_ssa_def *col[3])
+{
+   unsigned yzx[4] = {1, 2, 0, 0};
+   unsigned zxy[4] = {2, 0, 1, 0};
+
+   nir_ssa_def *pos =
+      nir_fmul(b, col[0],
+               nir_fmul(b, nir_swizzle(b, col[1], yzx, 3, true),
+                           nir_swizzle(b, col[2], zxy, 3, true)));
+   nir_ssa_def *neg =
+      nir_fmul(b, col[0],
+               nir_fmul(b, nir_swizzle(b, col[1], zxy, 3, true),
+                           nir_swizzle(b, col[2], yzx, 3, true)));
+
+   nir_ssa_def *terms = nir_fsub(b, pos, neg);
+
+   /* Horizontal add of the three per-column terms. */
+   return nir_fadd(b, nir_channel(b, terms, 0),
+                      nir_fadd(b, nir_channel(b, terms, 1),
+                                  nir_channel(b, terms, 2)));
+}
+
+/* Determinant of a 4x4 matrix given as four column vectors, by cofactor
+ * expansion along the first column:
+ * det = c0*m0 - c1*m1 + c2*m2 - c3*m3, where m_i is the 3x3 minor
+ * obtained by deleting row i and column 0.
+ */
+static nir_ssa_def *
+build_mat4_det(nir_builder *b, nir_ssa_def **col)
+{
+   nir_ssa_def *subdet[4];
+   for (unsigned i = 0; i < 4; i++) {
+      /* Swizzle selecting the three rows other than row i.  The old
+       * formulation (swiz[j - (j > i)] = j for j < 4) wrote one element
+       * past the end of swiz[] when i == 3 and j == 3; filling the three
+       * destination slots directly is value-equivalent and in bounds.
+       */
+      unsigned swiz[3];
+      for (unsigned j = 0; j < 3; j++)
+         swiz[j] = j + (j >= i);
+
+      nir_ssa_def *subcol[3];
+      subcol[0] = nir_swizzle(b, col[1], swiz, 3, true);
+      subcol[1] = nir_swizzle(b, col[2], swiz, 3, true);
+      subcol[2] = nir_swizzle(b, col[3], swiz, 3, true);
+
+      subdet[i] = build_mat3_det(b, subcol);
+   }
+
+   nir_ssa_def *prod = nir_fmul(b, col[0], nir_vec(b, subdet, 4));
+
+   /* Alternating cofactor signs: + - + - */
+   return nir_fadd(b, nir_fsub(b, nir_channel(b, prod, 0),
+                                  nir_channel(b, prod, 1)),
+                      nir_fsub(b, nir_channel(b, prod, 2),
+                                  nir_channel(b, prod, 3)));
+}
+
+/* Dispatch to the size-specific determinant builder based on the
+ * matrix's column count (src->type is a square matrix type).
+ */
+static nir_ssa_def *
+build_mat_det(struct vtn_builder *b, struct vtn_ssa_value *src)
+{
+   unsigned size = glsl_get_vector_elements(src->type);
+
+   nir_ssa_def *cols[4];
+   for (unsigned c = 0; c < size; c++)
+      cols[c] = src->elems[c]->def;
+
+   if (size == 2)
+      return build_mat2_det(&b->nb, cols);
+   if (size == 3)
+      return build_mat3_det(&b->nb, cols);
+   if (size == 4)
+      return build_mat4_det(&b->nb, cols);
+
+   unreachable("Invalid matrix size");
+}
+
+/* Computes the determinant of the submatrix of src obtained by deleting
+ * the specified row and column (the minor used for cofactor expansion).
+ */
+static nir_ssa_def *
+build_mat_subdet(struct nir_builder *b, struct vtn_ssa_value *src,
+                 unsigned size, unsigned row, unsigned col)
+{
+   assert(row < size && col < size);
+   if (size == 2) {
+      /* The 1x1 minor is just the opposite corner element. */
+      return nir_channel(b, src->elems[1 - col]->def, 1 - row);
+   } else {
+      /* Swizzle selecting every row except the specified one.  The old
+       * loop (swiz[j - (j > row)] = j for j < 4) stored one element past
+       * the end of swiz[] when row == 3 and j == 3; filling the size-1
+       * destination slots directly is value-equivalent and in bounds.
+       */
+      unsigned swiz[3];
+      for (unsigned j = 0; j < 3; j++)
+         swiz[j] = j + (j >= row);
+
+      /* Grab all but the specified column */
+      nir_ssa_def *subcol[3];
+      for (unsigned j = 0; j < size; j++) {
+         if (j != col) {
+            subcol[j - (j > col)] = nir_swizzle(b, src->elems[j]->def,
+                                                swiz, size - 1, true);
+         }
+      }
+
+      if (size == 3) {
+         return build_mat2_det(b, subcol);
+      } else {
+         assert(size == 4);
+         return build_mat3_det(b, subcol);
+      }
+   }
+}
+
+/* Invert a square matrix via the adjugate: inverse = adj(M) / det(M).
+ * Entry (r,c) of the adjugate is the signed cofactor of entry (c,r).
+ */
+static struct vtn_ssa_value *
+matrix_inverse(struct vtn_builder *b, struct vtn_ssa_value *src)
+{
+   unsigned size = glsl_get_vector_elements(src->type);
+   nir_ssa_def *adj_col[4];
+
+   /* Build the adjugate matrix one column at a time. */
+   for (unsigned col = 0; col < size; col++) {
+      nir_ssa_def *elem[4];
+      for (unsigned row = 0; row < size; row++) {
+         nir_ssa_def *cofactor =
+            build_mat_subdet(&b->nb, src, size, col, row);
+
+         /* Checkerboard sign pattern. */
+         elem[row] = ((row + col) % 2) ? nir_fneg(&b->nb, cofactor)
+                                       : cofactor;
+      }
+
+      adj_col[col] = nir_vec(&b->nb, elem, size);
+   }
+
+   nir_ssa_def *det_inv = nir_frcp(&b->nb, build_mat_det(b, src));
+
+   struct vtn_ssa_value *val = vtn_create_ssa_value(b, src->type);
+   for (unsigned i = 0; i < size; i++)
+      val->elems[i]->def = nir_fmul(&b->nb, adj_col[i], det_inv);
+
+   return val;
+}
+
+/* Euclidean length of a 1-4 component vector: sqrt(dot(vec, vec)). */
+static nir_ssa_def *
+build_length(nir_builder *b, nir_ssa_def *vec)
+{
+   nir_ssa_def *dot;
+
+   switch (vec->num_components) {
+   case 1: dot = nir_fmul(b, vec, vec);  break;
+   case 2: dot = nir_fdot2(b, vec, vec); break;
+   case 3: dot = nir_fdot3(b, vec, vec); break;
+   case 4: dot = nir_fdot4(b, vec, vec); break;
+   default:
+      unreachable("Invalid number of components");
+   }
+
+   return nir_fsqrt(b, dot);
+}
+
+/* Float clamp(x, min_val, max_val) built as min(max(x, min_val), max_val). */
+static inline nir_ssa_def *
+build_fclamp(nir_builder *b,
+             nir_ssa_def *x, nir_ssa_def *min_val, nir_ssa_def *max_val)
+{
+   nir_ssa_def *lower_bounded = nir_fmax(b, x, min_val);
+   return nir_fmin(b, lower_bounded, max_val);
+}
+
+/**
+ * Return e^x, computed as 2^(x * log2(e)).
+ */
+static nir_ssa_def *
+build_exp(nir_builder *b, nir_ssa_def *x)
+{
+   nir_ssa_def *scaled = nir_fmul(b, x, nir_imm_float(b, M_LOG2E));
+   return nir_fexp2(b, scaled);
+}
+
+/**
+ * Return ln(x), computed as log2(x) / log2(e).
+ */
+static nir_ssa_def *
+build_log(nir_builder *b, nir_ssa_def *x)
+{
+   nir_ssa_def *log2_x = nir_flog2(b, x);
+   return nir_fmul(b, log2_x, nir_imm_float(b, 1.0 / M_LOG2E));
+}
+
+/* Polynomial approximation of asin(x):
+ *
+ *    asin(x) ~= sign(x) * (pi/2 - sqrt(1 - |x|) * p(|x|))
+ *
+ * where p is a cubic in |x| evaluated here in Horner form with the
+ * same coefficients as the original nested-call formulation.
+ */
+static nir_ssa_def *
+build_asin(nir_builder *b, nir_ssa_def *x)
+{
+   nir_ssa_def *abs_x = nir_fabs(b, x);
+
+   /* Horner evaluation of the correction polynomial p(|x|). */
+   nir_ssa_def *poly = nir_imm_float(b, -0.03102955f);
+   poly = nir_fadd(b, nir_imm_float(b, 0.086566724f),
+                      nir_fmul(b, abs_x, poly));
+   poly = nir_fadd(b, nir_imm_float(b, M_PI_4f - 1.0f),
+                      nir_fmul(b, abs_x, poly));
+   poly = nir_fadd(b, nir_imm_float(b, M_PI_2f),
+                      nir_fmul(b, abs_x, poly));
+
+   nir_ssa_def *sqrt_term =
+      nir_fsqrt(b, nir_fsub(b, nir_imm_float(b, 1.0f), abs_x));
+
+   return nir_fmul(b, nir_fsign(b, x),
+                   nir_fsub(b, nir_imm_float(b, M_PI_2f),
+                            nir_fmul(b, sqrt_term, poly)));
+}
+
+/**
+ * Compute xs[0] + xs[1] + ... + xs[terms-1] as a left-to-right chain
+ * of fadd instructions.
+ */
+static nir_ssa_def *
+build_fsum(nir_builder *b, nir_ssa_def **xs, int terms)
+{
+   nir_ssa_def *sum = xs[0];
+
+   for (int term = 1; term < terms; term++)
+      sum = nir_fadd(b, sum, xs[term]);
+
+   return sum;
+}
+
+/* atan(y_over_x) via range reduction to [0, 1] followed by an odd
+ * polynomial approximation, with fixups for the reduced range and the
+ * input's sign.
+ */
+static nir_ssa_def *
+build_atan(nir_builder *b, nir_ssa_def *y_over_x)
+{
+   nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x);
+   nir_ssa_def *one = nir_imm_float(b, 1.0f);
+
+   /*
+    * range-reduction, first step:
+    *
+    *      / y_over_x         if |y_over_x| <= 1.0;
+    * x = <
+    *      \ 1.0 / y_over_x   otherwise
+    */
+   nir_ssa_def *x = nir_fdiv(b, nir_fmin(b, abs_y_over_x, one),
+                                nir_fmax(b, abs_y_over_x, one));
+
+   /*
+    * approximate atan by evaluating the odd polynomial
+    *
+    * x   * 0.9999793128310355 - x^3  * 0.3326756418091246 +
+    * x^5 * 0.1938924977115610 - x^7  * 0.1173503194786851 +
+    * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444
+    */
+   static const float coeffs[] = {
+       0.9999793128310355f, -0.3326756418091246f,
+       0.1938924977115610f, -0.1173503194786851f,
+       0.0536813784310406f, -0.0121323213173444f,
+   };
+
+   nir_ssa_def *x_2 = nir_fmul(b, x, x);
+   nir_ssa_def *odd_pow = x;   /* x^1, x^3, x^5, ... */
+   nir_ssa_def *terms[ARRAY_SIZE(coeffs)];
+   for (unsigned i = 0; i < ARRAY_SIZE(coeffs); i++) {
+      terms[i] = nir_fmul(b, odd_pow, nir_imm_float(b, coeffs[i]));
+      if (i + 1 < ARRAY_SIZE(coeffs))
+         odd_pow = nir_fmul(b, odd_pow, x_2);
+   }
+
+   nir_ssa_def *poly = build_fsum(b, terms, ARRAY_SIZE(terms));
+
+   /* range-reduction fixup: atan(1/x) = pi/2 - atan(x) for x > 0 */
+   poly = nir_fadd(b, poly,
+                   nir_fmul(b,
+                            nir_b2f(b, nir_flt(b, one, abs_y_over_x)),
+                            nir_fadd(b, nir_fmul(b, poly,
+                                                 nir_imm_float(b, -2.0f)),
+                                        nir_imm_float(b, M_PI_2f))));
+
+   /* sign fixup */
+   return nir_fmul(b, poly, nir_fsign(b, y_over_x));
+}
+
+/* atan2(y, x) built with explicit NIR control flow: when |x| is not
+ * negligible relative to |y| we compute atan(y/x) and fix up the
+ * quadrant; otherwise the result is +-pi/2 by y's sign.  The two arms
+ * are joined with a hand-constructed phi.
+ */
+static nir_ssa_def *
+build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
+{
+   nir_ssa_def *zero = nir_imm_float(b, 0.0f);
+
+   /* If |x| >= 1.0e-8 * |y|: */
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(
+      nir_fge(b, nir_fabs(b, x),
+              nir_fmul(b, nir_imm_float(b, 1.0e-8f), nir_fabs(b, y))));
+   nir_builder_cf_insert(b, &if_stmt->cf_node);
+
+   /* Then...call atan(y/x) and fix it up: add +-pi when x is negative
+    * to move the result into the correct quadrant.
+    */
+   b->cursor = nir_after_cf_list(&if_stmt->then_list);
+   nir_ssa_def *atan1 = build_atan(b, nir_fdiv(b, y, x));
+   nir_ssa_def *r_then =
+      nir_bcsel(b, nir_flt(b, x, zero),
+                   nir_fadd(b, atan1,
+                               nir_bcsel(b, nir_fge(b, y, zero),
+                                            nir_imm_float(b, M_PIf),
+                                            nir_imm_float(b, -M_PIf))),
+                   atan1);
+
+   /* Else... x is (nearly) zero, so the result is +-pi/2 by y's sign. */
+   b->cursor = nir_after_cf_list(&if_stmt->else_list);
+   nir_ssa_def *r_else =
+      nir_fmul(b, nir_fsign(b, y), nir_imm_float(b, M_PI_2f));
+
+   b->cursor = nir_after_cf_node(&if_stmt->cf_node);
+
+   /* Join the two arms with a phi whose predecessors are the first
+    * blocks of the then and else lists.
+    */
+   nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+   nir_ssa_dest_init(&phi->instr, &phi->dest, r_then->num_components, NULL);
+
+   nir_phi_src *phi_src0 = ralloc(phi, nir_phi_src);
+   nir_phi_src *phi_src1 = ralloc(phi, nir_phi_src);
+
+   phi_src0->pred = nir_cf_node_as_block((nir_cf_node *) exec_list_get_head(&if_stmt->then_list));
+   phi_src0->src = nir_src_for_ssa(r_then);
+   exec_list_push_tail(&phi->srcs, &phi_src0->node);
+   phi_src1->pred = nir_cf_node_as_block((nir_cf_node *) exec_list_get_head(&if_stmt->else_list));
+   phi_src1->src = nir_src_for_ssa(r_else);
+   exec_list_push_tail(&phi->srcs, &phi_src1->node);
+
+   nir_builder_instr_insert(b, &phi->instr);
+
+   return &phi->dest.ssa;
+}
+
+/* frexp(x): returns the mantissa in [0.5, 1.0) (or 0) and writes the
+ * exponent through *exponent, by manipulating the IEEE-754 single
+ * encoding directly.  NOTE(review): the integer ops are applied to the
+ * float value without an explicit bitcast — this relies on NIR SSA
+ * values being typeless bit-vectors; confirm against the NIR opcode
+ * definitions.
+ */
+static nir_ssa_def *
+build_frexp(nir_builder *b, nir_ssa_def *x, nir_ssa_def **exponent)
+{
+   nir_ssa_def *abs_x = nir_fabs(b, x);
+   nir_ssa_def *zero = nir_imm_float(b, 0.0f);
+
+   /* Single-precision floating-point values are stored as
+    *   1 sign bit;
+    *   8 exponent bits;
+    *   23 mantissa bits.
+    *
+    * An exponent shift of 23 will shift the mantissa out, leaving only the
+    * exponent and sign bit (which itself may be zero, if the absolute value
+    * was taken before the bitcast and shift).
+    */
+   nir_ssa_def *exponent_shift = nir_imm_int(b, 23);
+   nir_ssa_def *exponent_bias = nir_imm_int(b, -126);
+
+   /* Keeps the sign bit and the 23 mantissa bits, drops the exponent. */
+   nir_ssa_def *sign_mantissa_mask = nir_imm_int(b, 0x807fffffu);
+
+   /* Exponent of floating-point values in the range [0.5, 1.0). */
+   nir_ssa_def *exponent_value = nir_imm_int(b, 0x3f000000u);
+
+   /* frexp(+-0) must produce exponent 0 and mantissa 0, so both the
+    * bias and the replacement exponent are gated on x being nonzero.
+    */
+   nir_ssa_def *is_not_zero = nir_fne(b, abs_x, zero);
+
+   *exponent =
+      nir_iadd(b, nir_ushr(b, abs_x, exponent_shift),
+                  nir_bcsel(b, is_not_zero, exponent_bias, zero));
+
+   return nir_ior(b, nir_iand(b, x, sign_mantissa_mask),
+                     nir_bcsel(b, is_not_zero, exponent_value, zero));
+}
+
+static void
+handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
+                   const uint32_t *w, unsigned count)
+{
+   struct nir_builder *nb = &b->nb;
+   const struct glsl_type *dest_type =
+      vtn_value(b, w[1], vtn_value_type_type)->type->type;
+
+   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+   val->ssa = vtn_create_ssa_value(b, dest_type);
+
+   /* Collect the various SSA sources */
+   unsigned num_inputs = count - 5;
+   nir_ssa_def *src[3];
+   for (unsigned i = 0; i < num_inputs; i++)
+      src[i] = vtn_ssa_value(b, w[i + 5])->def;
+
+   nir_op op;
+   switch (entrypoint) {
+   case GLSLstd450Round:       op = nir_op_fround_even;   break; /* TODO */
+   case GLSLstd450RoundEven:   op = nir_op_fround_even;   break;
+   case GLSLstd450Trunc:       op = nir_op_ftrunc;        break;
+   case GLSLstd450FAbs:        op = nir_op_fabs;          break;
+   case GLSLstd450SAbs:        op = nir_op_iabs;          break;
+   case GLSLstd450FSign:       op = nir_op_fsign;         break;
+   case GLSLstd450SSign:       op = nir_op_isign;         break;
+   case GLSLstd450Floor:       op = nir_op_ffloor;        break;
+   case GLSLstd450Ceil:        op = nir_op_fceil;         break;
+   case GLSLstd450Fract:       op = nir_op_ffract;        break;
+   case GLSLstd450Radians:
+      val->ssa->def = nir_fmul(nb, src[0], nir_imm_float(nb, 0.01745329251));
+      return;
+   case GLSLstd450Degrees:
+      val->ssa->def = nir_fmul(nb, src[0], nir_imm_float(nb, 57.2957795131));
+      return;
+   case GLSLstd450Sin:         op = nir_op_fsin;       break;
+   case GLSLstd450Cos:         op = nir_op_fcos;       break;
+   case GLSLstd450Tan:
+      val->ssa->def = nir_fdiv(nb, nir_fsin(nb, src[0]),
+                               nir_fcos(nb, src[0]));
+      return;
+   case GLSLstd450Pow:         op = nir_op_fpow;       break;
+   case GLSLstd450Exp2:        op = nir_op_fexp2;      break;
+   case GLSLstd450Log2:        op = nir_op_flog2;      break;
+   case GLSLstd450Sqrt:        op = nir_op_fsqrt;      break;
+   case GLSLstd450InverseSqrt: op = nir_op_frsq;       break;
+
+   case GLSLstd450Modf: {
+      val->ssa->def = nir_ffract(nb, src[0]);
+      nir_deref_var *out = vtn_value(b, w[6], vtn_value_type_deref)->deref;
+      nir_store_deref_var(nb, out, nir_ffloor(nb, src[0]), 0xf);
+      return;
+   }
+
+   op = nir_op_fmod;       break;
+   case GLSLstd450FMin:        op = nir_op_fmin;       break;
+   case GLSLstd450UMin:        op = nir_op_umin;       break;
+   case GLSLstd450SMin:        op = nir_op_imin;       break;
+   case GLSLstd450FMax:        op = nir_op_fmax;       break;
+   case GLSLstd450UMax:        op = nir_op_umax;       break;
+   case GLSLstd450SMax:        op = nir_op_imax;       break;
+   case GLSLstd450FMix:        op = nir_op_flrp;       break;
+   case GLSLstd450Step:
+      val->ssa->def = nir_sge(nb, src[1], src[0]);
+      return;
+
+   case GLSLstd450Fma:         op = nir_op_ffma;       break;
+   case GLSLstd450Ldexp:       op = nir_op_ldexp;      break;
+
+   /* Packing/Unpacking functions */
+   case GLSLstd450PackSnorm4x8:      op = nir_op_pack_snorm_4x8;      break;
+   case GLSLstd450PackUnorm4x8:      op = nir_op_pack_unorm_4x8;      break;
+   case GLSLstd450PackSnorm2x16:     op = nir_op_pack_snorm_2x16;     break;
+   case GLSLstd450PackUnorm2x16:     op = nir_op_pack_unorm_2x16;     break;
+   case GLSLstd450PackHalf2x16:      op = nir_op_pack_half_2x16;      break;
+   case GLSLstd450UnpackSnorm4x8:    op = nir_op_unpack_snorm_4x8;    break;
+   case GLSLstd450UnpackUnorm4x8:    op = nir_op_unpack_unorm_4x8;    break;
+   case GLSLstd450UnpackSnorm2x16:   op = nir_op_unpack_snorm_2x16;   break;
+   case GLSLstd450UnpackUnorm2x16:   op = nir_op_unpack_unorm_2x16;   break;
+   case GLSLstd450UnpackHalf2x16:    op = nir_op_unpack_half_2x16;    break;
+
+   case GLSLstd450Length:
+      val->ssa->def = build_length(nb, src[0]);
+      return;
+   case GLSLstd450Distance:
+      val->ssa->def = build_length(nb, nir_fsub(nb, src[0], src[1]));
+      return;
+   case GLSLstd450Normalize:
+      val->ssa->def = nir_fdiv(nb, src[0], build_length(nb, src[0]));
+      return;
+
+   case GLSLstd450Exp:
+      val->ssa->def = build_exp(nb, src[0]);
+      return;
+
+   case GLSLstd450Log:
+      val->ssa->def = build_log(nb, src[0]);
+      return;
+
+   case GLSLstd450FClamp:
+      val->ssa->def = build_fclamp(nb, src[0], src[1], src[2]);
+      return;
+   case GLSLstd450UClamp:
+      val->ssa->def = nir_umin(nb, nir_umax(nb, src[0], src[1]), src[2]);
+      return;
+   case GLSLstd450SClamp:
+      val->ssa->def = nir_imin(nb, nir_imax(nb, src[0], src[1]), src[2]);
+      return;
+
+   case GLSLstd450Cross: {
+      unsigned yzx[4] = { 1, 2, 0, 0 };
+      unsigned zxy[4] = { 2, 0, 1, 0 };
+      val->ssa->def =
+         nir_fsub(nb, nir_fmul(nb, nir_swizzle(nb, src[0], yzx, 3, true),
+                                   nir_swizzle(nb, src[1], zxy, 3, true)),
+                      nir_fmul(nb, nir_swizzle(nb, src[0], zxy, 3, true),
+                                   nir_swizzle(nb, src[1], yzx, 3, true)));
+      return;
+   }
+
+   case GLSLstd450SmoothStep: {
+      /* t = clamp((x - edge0) / (edge1 - edge0), 0, 1) */
+      nir_ssa_def *t =
+         build_fclamp(nb, nir_fdiv(nb, nir_fsub(nb, src[2], src[0]),
+                                       nir_fsub(nb, src[1], src[0])),
+                          nir_imm_float(nb, 0.0), nir_imm_float(nb, 1.0));
+      /* result = t * t * (3 - 2 * t) */
+      val->ssa->def =
+         nir_fmul(nb, t, nir_fmul(nb, t,
+            nir_fsub(nb, nir_imm_float(nb, 3.0),
+                         nir_fmul(nb, nir_imm_float(nb, 2.0), t))));
+      return;
+   }
+
+   case GLSLstd450FaceForward:
+      val->ssa->def =
+         nir_bcsel(nb, nir_flt(nb, nir_fdot(nb, src[2], src[1]),
+                                   nir_imm_float(nb, 0.0)),
+                       src[0], nir_fneg(nb, src[0]));
+      return;
+
+   case GLSLstd450Reflect:
+      /* I - 2 * dot(N, I) * N */
+      val->ssa->def =
+         nir_fsub(nb, src[0], nir_fmul(nb, nir_imm_float(nb, 2.0),
+                              nir_fmul(nb, nir_fdot(nb, src[0], src[1]),
+                                           src[1])));
+      return;
+
+   case GLSLstd450Refract: {
+      nir_ssa_def *I = src[0];
+      nir_ssa_def *N = src[1];
+      nir_ssa_def *eta = src[2];
+      nir_ssa_def *n_dot_i = nir_fdot(nb, N, I);
+      nir_ssa_def *one = nir_imm_float(nb, 1.0);
+      nir_ssa_def *zero = nir_imm_float(nb, 0.0);
+      /* k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) */
+      nir_ssa_def *k =
+         nir_fsub(nb, one, nir_fmul(nb, eta, nir_fmul(nb, eta,
+                      nir_fsub(nb, one, nir_fmul(nb, n_dot_i, n_dot_i)))));
+      nir_ssa_def *result =
+         nir_fsub(nb, nir_fmul(nb, eta, I),
+                      nir_fmul(nb, nir_fadd(nb, nir_fmul(nb, eta, n_dot_i),
+                                                nir_fsqrt(nb, k)), N));
+      /* XXX: bcsel, or if statement? */
+      val->ssa->def = nir_bcsel(nb, nir_flt(nb, k, zero), zero, result);
+      return;
+   }
+
+   case GLSLstd450Sinh:
+      /* 0.5 * (e^x - e^(-x)) */
+      val->ssa->def =
+         nir_fmul(nb, nir_imm_float(nb, 0.5f),
+                      nir_fsub(nb, build_exp(nb, src[0]),
+                                   build_exp(nb, nir_fneg(nb, src[0]))));
+      return;
+
+   case GLSLstd450Cosh:
+      /* 0.5 * (e^x + e^(-x)) */
+      val->ssa->def =
+         nir_fmul(nb, nir_imm_float(nb, 0.5f),
+                      nir_fadd(nb, build_exp(nb, src[0]),
+                                   build_exp(nb, nir_fneg(nb, src[0]))));
+      return;
+
+   case GLSLstd450Tanh:
+      /* (e^x - e^(-x)) / (e^x + e^(-x)) */
+      val->ssa->def =
+         nir_fdiv(nb, nir_fsub(nb, build_exp(nb, src[0]),
+                                   build_exp(nb, nir_fneg(nb, src[0]))),
+                      nir_fadd(nb, build_exp(nb, src[0]),
+                                   build_exp(nb, nir_fneg(nb, src[0]))));
+      return;
+
+   case GLSLstd450Asinh:
+      val->ssa->def = nir_fmul(nb, nir_fsign(nb, src[0]),
+         build_log(nb, nir_fadd(nb, nir_fabs(nb, src[0]),
+                       nir_fsqrt(nb, nir_fadd(nb, nir_fmul(nb, src[0], src[0]),
+                                                  nir_imm_float(nb, 1.0f))))));
+      return;
+   case GLSLstd450Acosh:
+      val->ssa->def = build_log(nb, nir_fadd(nb, src[0],
+         nir_fsqrt(nb, nir_fsub(nb, nir_fmul(nb, src[0], src[0]),
+                                    nir_imm_float(nb, 1.0f)))));
+      return;
+   case GLSLstd450Atanh: {
+      nir_ssa_def *one = nir_imm_float(nb, 1.0);
+      val->ssa->def = nir_fmul(nb, nir_imm_float(nb, 0.5f),
+         build_log(nb, nir_fdiv(nb, nir_fadd(nb, one, src[0]),
+                                    nir_fsub(nb, one, src[0]))));
+      return;
+   }
+
+   case GLSLstd450FindILsb:   op = nir_op_find_lsb;   break;
+   case GLSLstd450FindSMsb:   op = nir_op_ifind_msb;  break;
+   case GLSLstd450FindUMsb:   op = nir_op_ufind_msb;  break;
+
+   case GLSLstd450Asin:
+      val->ssa->def = build_asin(nb, src[0]);
+      return;
+
+   case GLSLstd450Acos:
+      val->ssa->def = nir_fsub(nb, nir_imm_float(nb, M_PI_2f),
+                                   build_asin(nb, src[0]));
+      return;
+
+   case GLSLstd450Atan:
+      val->ssa->def = build_atan(nb, src[0]);
+      return;
+
+   case GLSLstd450Atan2:
+      val->ssa->def = build_atan2(nb, src[0], src[1]);
+      return;
+
+   case GLSLstd450Frexp: {
+      nir_ssa_def *exponent;
+      val->ssa->def = build_frexp(nb, src[0], &exponent);
+      nir_deref_var *out = vtn_value(b, w[6], vtn_value_type_deref)->deref;
+      nir_store_deref_var(nb, out, exponent, 0xf);
+      return;
+   }
+
+   case GLSLstd450FrexpStruct: {
+      assert(glsl_type_is_struct(val->ssa->type));
+      val->ssa->elems[0]->def = build_frexp(nb, src[0],
+                                            &val->ssa->elems[1]->def);
+      return;
+   }
+
+   case GLSLstd450ModfStruct:
+   case GLSLstd450PackDouble2x32:
+   case GLSLstd450UnpackDouble2x32:
+   case GLSLstd450IMix:
+   default:
+      unreachable("Unhandled opcode");
+   }
+
+   nir_alu_instr *instr = nir_alu_instr_create(b->shader, op);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest,
+                     glsl_get_vector_elements(val->ssa->type), val->name);
+   instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1;
+   val->ssa->def = &instr->dest.dest.ssa;
+
+   for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++)
+      instr->src[i].src = nir_src_for_ssa(src[i]);
+
+   nir_builder_instr_insert(nb, &instr->instr);
+}
+
+/* Entry point for the GLSL.std.450 extended instruction set.
+ *
+ * Handles the matrix opcodes directly and forwards every other opcode to
+ * handle_glsl450_alu().  In the instruction words, w[1] is the result type
+ * <id>, w[2] is the result <id>, and w[5] is the first operand <id>.
+ * Always returns true so instruction iteration continues.
+ */
+bool
+vtn_handle_glsl450_instruction(struct vtn_builder *b, uint32_t ext_opcode,
+                               const uint32_t *w, unsigned count)
+{
+   switch ((enum GLSLstd450)ext_opcode) {
+   case GLSLstd450Determinant: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      val->ssa = rzalloc(b, struct vtn_ssa_value);
+      val->ssa->type = vtn_value(b, w[1], vtn_value_type_type)->type->type;
+      val->ssa->def = build_mat_det(b, vtn_ssa_value(b, w[5]));
+      break;
+   }
+
+   case GLSLstd450MatrixInverse: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
+      /* matrix_inverse() builds the complete vtn_ssa_value, type included. */
+      val->ssa = matrix_inverse(b, vtn_ssa_value(b, w[5]));
+      break;
+   }
+
+   /* Interpolation at an explicit location is not implemented yet. */
+   case GLSLstd450InterpolateAtCentroid:
+   case GLSLstd450InterpolateAtSample:
+   case GLSLstd450InterpolateAtOffset:
+      unreachable("Unhandled opcode");
+
+   default:
+      handle_glsl450_alu(b, (enum GLSLstd450)ext_opcode, w, count);
+   }
+
+   return true;
+}
diff --git a/src/glsl/nir/spirv/vtn_private.h b/src/glsl/nir/spirv/vtn_private.h
new file mode 100644 (file)
index 0000000..9a066d6
--- /dev/null
@@ -0,0 +1,417 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+#include "nir/nir.h"
+#include "nir/nir_builder.h"
+#include "nir/nir_array.h"
+#include "nir_spirv.h"
+#include "spirv.h"
+
+struct vtn_builder;
+struct vtn_decoration;
+
+/* What a SPIR-V <id> currently resolves to; selects the union arm that is
+ * valid in struct vtn_value.
+ */
+enum vtn_value_type {
+   vtn_value_type_invalid = 0,
+   vtn_value_type_undef,
+   vtn_value_type_string,
+   vtn_value_type_decoration_group,
+   vtn_value_type_type,
+   vtn_value_type_constant,
+   vtn_value_type_deref,
+   vtn_value_type_function,
+   vtn_value_type_block,
+   vtn_value_type_ssa,
+   vtn_value_type_extension,
+   vtn_value_type_image_pointer,
+   vtn_value_type_sampled_image,
+};
+
+/* Classification of the branch that terminates a SPIR-V block. */
+enum vtn_branch_type {
+   vtn_branch_type_none,
+   vtn_branch_type_switch_break,
+   vtn_branch_type_switch_fallthrough,
+   vtn_branch_type_loop_break,
+   vtn_branch_type_loop_continue,
+   vtn_branch_type_discard,
+   vtn_branch_type_return,
+};
+
+/* Kinds of nodes in the structured control-flow tree (see vtn_build_cfg). */
+enum vtn_cf_node_type {
+   vtn_cf_node_type_block,
+   vtn_cf_node_type_if,
+   vtn_cf_node_type_loop,
+   vtn_cf_node_type_switch,
+};
+
+/* Common header embedded at the start of every control-flow tree node. */
+struct vtn_cf_node {
+   struct list_head link;
+   enum vtn_cf_node_type type;
+};
+
+struct vtn_loop {
+   struct vtn_cf_node node;
+
+   /* The main body of the loop */
+   struct list_head body;
+
+   /* The "continue" part of the loop.  This gets executed after the body
+    * and is where you go when you hit a continue.
+    */
+   struct list_head cont_body;
+
+   /* SPIR-V loop control mask (unroll hints etc.) */
+   SpvLoopControlMask control;
+};
+
+struct vtn_if {
+   struct vtn_cf_node node;
+
+   /* SPIR-V <id> of the boolean condition value */
+   uint32_t condition;
+
+   enum vtn_branch_type then_type;
+   struct list_head then_body;
+
+   enum vtn_branch_type else_type;
+   struct list_head else_body;
+
+   SpvSelectionControlMask control;
+};
+
+/* One arm of a structured switch. */
+struct vtn_case {
+   struct list_head link;
+
+   struct list_head body;
+
+   /* The block that starts this case */
+   struct vtn_block *start_block;
+
+   /* The fallthrough case, if any */
+   struct vtn_case *fallthrough;
+
+   /* The uint32_t values that map to this case */
+   nir_array values;
+
+   /* True if this is the default case */
+   bool is_default;
+
+   /* Initialized to false; used when sorting the list of cases */
+   bool visited;
+};
+
+struct vtn_switch {
+   struct vtn_cf_node node;
+
+   /* SPIR-V <id> of the value being switched on */
+   uint32_t selector;
+
+   struct list_head cases;
+};
+
+struct vtn_block {
+   struct vtn_cf_node node;
+
+   /** A pointer to the label instruction */
+   const uint32_t *label;
+
+   /** A pointer to the merge instruction (or NULL if none exists) */
+   const uint32_t *merge;
+
+   /** A pointer to the branch instruction that ends this block */
+   const uint32_t *branch;
+
+   /** Classification of the terminating branch above */
+   enum vtn_branch_type branch_type;
+
+   /** Points to the loop that this block starts (if it starts a loop) */
+   struct vtn_loop *loop;
+
+   /** Points to the switch case started by this block (if any) */
+   struct vtn_case *switch_case;
+
+   /** The last NIR block emitted for this SPIR-V block. */
+   nir_block *end_block;
+};
+
+/* One SPIR-V function and the structured body built for it. */
+struct vtn_function {
+   struct exec_node node;
+
+   nir_function_impl *impl;
+   struct vtn_block *start_block;
+
+   struct list_head body;
+
+   /* End of the function's instruction stream */
+   const uint32_t *end;
+
+   SpvFunctionControlMask control;
+};
+
+/* Callback invoked per instruction: (builder, opcode, words, word count).
+ * NOTE(review): the bool return presumably controls whether iteration
+ * continues — confirm against vtn_foreach_instruction's implementation.
+ */
+typedef bool (*vtn_instruction_handler)(struct vtn_builder *, uint32_t,
+                                        const uint32_t *, unsigned);
+
+void vtn_build_cfg(struct vtn_builder *b, const uint32_t *words,
+                   const uint32_t *end);
+void vtn_function_emit(struct vtn_builder *b, struct vtn_function *func,
+                       vtn_instruction_handler instruction_handler);
+
+const uint32_t *
+vtn_foreach_instruction(struct vtn_builder *b, const uint32_t *start,
+                        const uint32_t *end, vtn_instruction_handler handler);
+
+/* An SSA-resolved SPIR-V value.  Scalars and vectors use `def`; aggregate
+ * values (e.g. the structs seen in GLSLstd450FrexpStruct) use `elems`, one
+ * entry per element.  Which union arm is valid is determined by `type`.
+ */
+struct vtn_ssa_value {
+   union {
+      nir_ssa_def *def;
+      struct vtn_ssa_value **elems;
+   };
+
+   /* For matrices, if this is non-NULL, then this value is actually the
+    * transpose of some other value.  The value that `transposed` points to
+    * always dominates this value.
+    */
+   struct vtn_ssa_value *transposed;
+
+   const struct glsl_type *type;
+};
+
+/* A SPIR-V type plus the layout/decoration metadata that glsl_type alone
+ * cannot express.
+ */
+struct vtn_type {
+   const struct glsl_type *type;
+
+   /* for matrices, whether the matrix is stored row-major */
+   bool row_major;
+
+   /* for structs, the offset of each member */
+   unsigned *offsets;
+
+   /* for structs, whether it was decorated as a "non-SSBO-like" block */
+   bool block;
+
+   /* for structs, whether it was decorated as an "SSBO-like" block */
+   bool buffer_block;
+
+   /* for structs with block == true, whether this is a builtin block (i.e. a
+    * block that contains only builtins).
+    */
+   bool builtin_block;
+
+   /* Image format for image_load_store type images */
+   unsigned image_format;
+
+   /* Access qualifier for storage images */
+   SpvAccessQualifier access_qualifier;
+
+   /* for arrays and matrices, the array stride */
+   unsigned stride;
+
+   /* for arrays, the vtn_type for the elements of the array */
+   struct vtn_type *array_element;
+
+   /* for structures, the vtn_type for each member */
+   struct vtn_type **members;
+
+   /* Whether this type, or a parent type, has been decorated as a builtin */
+   bool is_builtin;
+
+   /* Which builtin this is (only meaningful when is_builtin is set) */
+   SpvBuiltIn builtin;
+};
+
+/* A resolved image access: the image variable plus the coordinate and
+ * sample index to address it with.
+ */
+struct vtn_image_pointer {
+   nir_deref_var *deref;
+   nir_ssa_def *coord;
+   nir_ssa_def *sample;
+};
+
+struct vtn_sampled_image {
+   nir_deref_var *image; /* Image or array of images */
+   nir_deref_var *sampler; /* Sampler */
+};
+
+/* Everything a SPIR-V <id> can resolve to.  The union arm that is valid is
+ * selected by value_type (enum vtn_value_type above).
+ */
+struct vtn_value {
+   enum vtn_value_type value_type;
+   const char *name;
+   struct vtn_decoration *decoration;
+   union {
+      void *ptr;
+      char *str;
+      struct vtn_type *type;
+      struct {
+         nir_constant *constant;
+         const struct glsl_type *const_type;
+      };
+      struct {
+         nir_deref_var *deref;
+         struct vtn_type *deref_type;
+      };
+      struct vtn_image_pointer *image;
+      struct vtn_sampled_image *sampled_image;
+      struct vtn_function *func;
+      struct vtn_block *block;
+      struct vtn_ssa_value *ssa;
+      vtn_instruction_handler ext_handler;
+   };
+};
+
+/* Special values for vtn_decoration::scope (non-negative values are
+ * structure member indices, offset by VTN_DEC_STRUCT_MEMBER0).
+ */
+#define VTN_DEC_DECORATION -1
+#define VTN_DEC_EXECUTION_MODE -2
+#define VTN_DEC_STRUCT_MEMBER0 0
+
+struct vtn_decoration {
+   struct vtn_decoration *next;
+
+   /* Specifies how to apply this decoration.  Negative values represent a
+    * decoration or execution mode. (See the VTN_DEC_ #defines above.)
+    * Non-negative values specify that it applies to a structure member.
+    */
+   int scope;
+
+   /* Literal operand words following the decoration in the SPIR-V stream */
+   const uint32_t *literals;
+   struct vtn_value *group;
+
+   /* Which decoration/mode this is; selected by the sign of `scope` */
+   union {
+      SpvDecoration decoration;
+      SpvExecutionMode exec_mode;
+   };
+};
+
+/* All state carried while translating one SPIR-V module to NIR. */
+struct vtn_builder {
+   /* NIR builder used to emit instructions */
+   nir_builder nb;
+
+   nir_shader *shader;
+   nir_function_impl *impl;
+   /* Block currently being parsed */
+   struct vtn_block *block;
+
+   /* Current file, line, and column.  Useful for debugging.  Set
+    * automatically by vtn_foreach_instruction.
+    */
+   char *file;
+   int line, col;
+
+   /*
+    * In SPIR-V, constants are global, whereas in NIR, the load_const
+    * instruction we use is per-function. So while we parse each function, we
+    * keep a hash table of constants we've resolved to nir_ssa_value's so
+    * far, and we lazily resolve them when we see them used in a function.
+    */
+   struct hash_table *const_table;
+
+   /*
+    * Map from phi instructions (pointer to the start of the instruction)
+    * to the variable corresponding to it.
+    */
+   struct hash_table *phi_table;
+
+   unsigned num_specializations;
+   struct nir_spirv_specialization *specializations;
+
+   /*
+    * NIR variable for each SPIR-V builtin.
+    */
+   struct {
+      nir_variable *in;
+      nir_variable *out;
+   } builtins[42]; /* XXX need symbolic constant from SPIR-V header */
+
+   /* Table of all values, indexed by SPIR-V <id>; value_id_bound is the
+    * module's declared id bound.
+    */
+   unsigned value_id_bound;
+   struct vtn_value *values;
+
+   gl_shader_stage entry_point_stage;
+   const char *entry_point_name;
+   struct vtn_value *entry_point;
+   bool origin_upper_left;
+
+   /* Function currently being parsed */
+   struct vtn_function *func;
+   struct exec_list functions;
+
+   /* Current function parameter index */
+   unsigned func_param_idx;
+
+   bool has_loop_continue;
+};
+
+/* Claim the value-table slot for a fresh SPIR-V <id>.
+ *
+ * The slot must still be unused (vtn_value_type_invalid); it is tagged with
+ * the requested value type and returned for the caller to fill in.
+ */
+static inline struct vtn_value *
+vtn_push_value(struct vtn_builder *b, uint32_t value_id,
+               enum vtn_value_type value_type)
+{
+   assert(value_id < b->value_id_bound);
+
+   struct vtn_value *val = &b->values[value_id];
+   assert(val->value_type == vtn_value_type_invalid);
+   val->value_type = value_type;
+   return val;
+}
+
+/* Look up the vtn_value for an <id> without checking what it holds. */
+static inline struct vtn_value *
+vtn_untyped_value(struct vtn_builder *b, uint32_t value_id)
+{
+   assert(value_id < b->value_id_bound);
+   return &b->values[value_id];
+}
+
+/* Look up the vtn_value for an <id>, asserting it has the expected type. */
+static inline struct vtn_value *
+vtn_value(struct vtn_builder *b, uint32_t value_id,
+          enum vtn_value_type value_type)
+{
+   struct vtn_value *val = vtn_untyped_value(b, value_id);
+   assert(val->value_type == value_type);
+   return val;
+}
+
+/* Resolve an <id> to its SSA value, materializing it if necessary. */
+struct vtn_ssa_value *vtn_ssa_value(struct vtn_builder *b, uint32_t value_id);
+
+struct vtn_ssa_value *vtn_create_ssa_value(struct vtn_builder *b,
+                                           const struct glsl_type *type);
+
+struct vtn_ssa_value *vtn_ssa_transpose(struct vtn_builder *b,
+                                        struct vtn_ssa_value *src);
+
+struct vtn_ssa_value *
+vtn_variable_load(struct vtn_builder *b, nir_deref_var *src,
+                  struct vtn_type *src_type);
+
+void vtn_variable_store(struct vtn_builder *b, struct vtn_ssa_value *src,
+                        nir_deref_var *dest, struct vtn_type *dest_type);
+
+
+/* Callback for vtn_foreach_decoration; `member` is the structure member
+ * index, or a negative VTN_DEC_ value for non-member decorations.
+ */
+typedef void (*vtn_decoration_foreach_cb)(struct vtn_builder *,
+                                          struct vtn_value *,
+                                          int member,
+                                          const struct vtn_decoration *,
+                                          void *);
+
+void vtn_foreach_decoration(struct vtn_builder *b, struct vtn_value *value,
+                            vtn_decoration_foreach_cb cb, void *data);
+
+typedef void (*vtn_execution_mode_foreach_cb)(struct vtn_builder *,
+                                              struct vtn_value *,
+                                              const struct vtn_decoration *,
+                                              void *);
+
+void vtn_foreach_execution_mode(struct vtn_builder *b, struct vtn_value *value,
+                                vtn_execution_mode_foreach_cb cb, void *data);
+
+/* Map a SPIR-V ALU opcode to a NIR opcode; *swap reports whether the two
+ * source operands must be exchanged.
+ */
+nir_op vtn_nir_alu_op_for_spirv_opcode(SpvOp opcode, bool *swap);
+
+void vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
+                    const uint32_t *w, unsigned count);
+
+bool vtn_handle_glsl450_instruction(struct vtn_builder *b, uint32_t ext_opcode,
+                                    const uint32_t *words, unsigned count);
diff --git a/src/glsl/nir/spirv2nir.c b/src/glsl/nir/spirv2nir.c
new file mode 100644 (file)
index 0000000..c837186
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+/*
+ * A simple executable that opens a SPIR-V shader, converts it to NIR, and
+ * dumps out the result.  This should be useful for testing the
+ * spirv_to_nir code.
+ */
+
+#include "spirv/nir_spirv.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* Load the SPIR-V binary named on the command line, translate it to NIR as
+ * a fragment shader with entry point "main", and dump the result to stderr.
+ */
+int main(int argc, char **argv)
+{
+   assert(argc >= 2);
+
+   int fd = open(argv[1], O_RDONLY);
+   assert(fd >= 0);
+
+   off_t len = lseek(fd, 0, SEEK_END);
+   assert(len > 0);
+   assert(len % 4 == 0);
+   size_t word_count = len / 4;
+
+   const void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+   /* mmap reports failure as MAP_FAILED ((void *)-1), not NULL. */
+   assert(map != MAP_FAILED);
+
+   nir_function *func = spirv_to_nir(map, word_count, NULL, 0,
+                                     MESA_SHADER_FRAGMENT, "main", NULL);
+   nir_print_shader(func->shader, stderr);
+
+   return 0;
+}
index b9f90e6..b9379ef 100644 (file)
@@ -30,24 +30,17 @@ protected:
    ~nir_cf_test();
 
    nir_builder b;
-   nir_shader *shader;
-   nir_function_impl *impl;
 };
 
 nir_cf_test::nir_cf_test()
 {
    static const nir_shader_compiler_options options = { };
-   shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, &options);
-   nir_function *func = nir_function_create(shader, "main");
-   nir_function_overload *overload = nir_function_overload_create(func);
-   impl = nir_function_impl_create(overload);
-
-   nir_builder_init(&b, impl);
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, &options);
 }
 
 nir_cf_test::~nir_cf_test()
 {
-   ralloc_free(shader);
+   ralloc_free(b.shader);
 }
 
 TEST_F(nir_cf_test, delete_break_in_loop)
@@ -56,12 +49,12 @@ TEST_F(nir_cf_test, delete_break_in_loop)
     *
     * while (...) { break; }
     */
-   nir_loop *loop = nir_loop_create(shader);
-   nir_cf_node_insert(nir_after_cf_list(&impl->body), &loop->cf_node);
+   nir_loop *loop = nir_loop_create(b.shader);
+   nir_cf_node_insert(nir_after_cf_list(&b.impl->body), &loop->cf_node);
 
    b.cursor = nir_after_cf_list(&loop->body);
 
-   nir_jump_instr *jump = nir_jump_instr_create(shader, nir_jump_break);
+   nir_jump_instr *jump = nir_jump_instr_create(b.shader, nir_jump_break);
    nir_builder_instr_insert(&b, &jump->instr);
 
    /* At this point, we should have:
@@ -82,10 +75,10 @@ TEST_F(nir_cf_test, delete_break_in_loop)
     *         block block_3:
     * }
     */
-   nir_block *block_0 = nir_start_block(impl);
+   nir_block *block_0 = nir_start_block(b.impl);
    nir_block *block_1 = nir_cf_node_as_block(nir_loop_first_cf_node(loop));
    nir_block *block_2 = nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node));
-   nir_block *block_3 = impl->end_block;
+   nir_block *block_3 = b.impl->end_block;
    ASSERT_EQ(nir_cf_node_block, block_0->cf_node.type);
    ASSERT_EQ(nir_cf_node_block, block_1->cf_node.type);
    ASSERT_EQ(nir_cf_node_block, block_2->cf_node.type);
@@ -108,12 +101,12 @@ TEST_F(nir_cf_test, delete_break_in_loop)
    EXPECT_TRUE(_mesa_set_search(block_2->predecessors, block_1));
    EXPECT_TRUE(_mesa_set_search(block_3->predecessors, block_2));
 
-   nir_print_shader(shader, stderr);
+   nir_print_shader(b.shader, stderr);
 
    /* Now remove the break. */
    nir_instr_remove(&jump->instr);
 
-   nir_print_shader(shader, stderr);
+   nir_print_shader(b.shader, stderr);
 
    /* At this point, we should have:
     *
@@ -151,5 +144,5 @@ TEST_F(nir_cf_test, delete_break_in_loop)
    EXPECT_TRUE(_mesa_set_search(block_2->predecessors, block_1));
    EXPECT_TRUE(_mesa_set_search(block_3->predecessors, block_2));
 
-   nir_metadata_require(impl, nir_metadata_dominance);
+   nir_metadata_require(b.impl, nir_metadata_dominance);
 }
index 1f69d0d..0f7a16a 100644 (file)
 #include "util/ralloc.h"
 #include "util/strtod.h"
 
+/* Out-of-memory hook required by the Mesa core code linked into the
+ * standalone tests; just reports the failing caller on stderr.
+ */
+extern "C" void
+_mesa_error_no_memory(const char *caller)
+{
+   /* End the message with a newline so it forms a complete log line. */
+   fprintf(stderr, "Mesa error: out of memory in %s\n", caller);
+}
+
 void
 _mesa_warning(struct gl_context *ctx, const char *fmt, ...)
 {
@@ -124,6 +130,11 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
       shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
+   ralloc_free(shProg->UboInterfaceBlockIndex);
+   shProg->UboInterfaceBlockIndex = NULL;
+   ralloc_free(shProg->SsboInterfaceBlockIndex);
+   shProg->SsboInterfaceBlockIndex = NULL;
+
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
    shProg->NumAtomicBuffers = 0;
@@ -149,6 +160,7 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.ARB_gpu_shader_fp64 = true;
    ctx->Extensions.ARB_sample_shading = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
+   ctx->Extensions.ARB_shader_draw_parameters = true;
    ctx->Extensions.ARB_shader_stencil_export = true;
    ctx->Extensions.ARB_shader_subroutine = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
index ee24312..8bdbb9c 100644 (file)
@@ -437,7 +437,8 @@ dri3_wait_x(struct glx_context *gc)
    struct dri3_drawable *priv = (struct dri3_drawable *)
       GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
 
-   loader_dri3_wait_x(&priv->loader_drawable);
+   if (priv)
+      loader_dri3_wait_x(&priv->loader_drawable);
 }
 
 static void
@@ -446,7 +447,8 @@ dri3_wait_gl(struct glx_context *gc)
    struct dri3_drawable *priv = (struct dri3_drawable *)
       GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
 
-   loader_dri3_wait_gl(&priv->loader_drawable);
+   if (priv)
+      loader_dri3_wait_gl(&priv->loader_drawable);
 }
 
 /**
diff --git a/src/isl/.gitignore b/src/isl/.gitignore
new file mode 100644 (file)
index 0000000..e9cfd67
--- /dev/null
@@ -0,0 +1 @@
+/isl_format_layout.c
diff --git a/src/isl/Makefile.am b/src/isl/Makefile.am
new file mode 100644 (file)
index 0000000..72f5460
--- /dev/null
@@ -0,0 +1,90 @@
+# Copyright 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+SUBDIRS = .
+
+noinst_LTLIBRARIES = libisl.la
+
+EXTRA_DIST = tests
+
+# The gallium includes are for the util/u_math.h include from main/macros.h
+AM_CPPFLAGS = \
+       $(INTEL_CFLAGS) \
+       $(VALGRIND_CFLAGS) \
+       $(DEFINES) \
+       -I$(top_srcdir)/include \
+       -I$(top_srcdir)/src \
+       -I$(top_srcdir)/src/mapi \
+       -I$(top_srcdir)/src/mesa \
+       -I$(top_srcdir)/src/mesa/drivers/dri/common \
+       -I$(top_srcdir)/src/mesa/drivers/dri/i965 \
+       -I$(top_srcdir)/src/gallium/auxiliary \
+       -I$(top_srcdir)/src/gallium/include \
+       -I$(top_builddir)/src
+
+libisl_la_CFLAGS = $(CFLAGS) -Wno-override-init
+
+libisl_la_SOURCES =                                     \
+       isl.c                                           \
+       isl.h                                           \
+       isl_format.c                                    \
+       isl_format_layout.c                             \
+       isl_gen4.c                                      \
+       isl_gen4.h                                      \
+       isl_gen6.c                                      \
+       isl_gen6.h                                      \
+       isl_gen7.c                                      \
+       isl_gen7.h                                      \
+       isl_gen8.c                                      \
+       isl_gen8.h                                      \
+       isl_gen9.c                                      \
+       isl_gen9.h                                      \
+       isl_image.c                                     \
+       $(NULL)
+
+BUILT_SOURCES =                                         \
+       isl_format_layout.c
+
+# isl_format_layout.c is generated at build time from the CSV format table.
+isl_format_layout.c: isl_format_layout_gen.bash \
+                     isl_format_layout.csv
+	$(AM_V_GEN)$(srcdir)/isl_format_layout_gen.bash \
+	    <$(srcdir)/isl_format_layout.csv >$@
+
+# ----------------------------------------------------------------------------
+#  Tests
+# ----------------------------------------------------------------------------
+
+TESTS = tests/isl_surf_get_image_offset_test
+
+check_PROGRAMS = $(TESTS)
+
+# Link tests to libi965_compiler.la for brw_get_device_info().
+tests_ldadd =                                          \
+       libisl.la                                       \
+       $(top_builddir)/src/mesa/drivers/dri/i965/libi965_compiler.la
+
+tests_isl_surf_get_image_offset_test_SOURCES =         \
+       tests/isl_surf_get_image_offset_test.c
+tests_isl_surf_get_image_offset_test_LDADD = $(tests_ldadd)
+
+# ----------------------------------------------------------------------------
+
+include $(top_srcdir)/install-lib-links.mk
diff --git a/src/isl/README b/src/isl/README
new file mode 100644 (file)
index 0000000..1ab4313
--- /dev/null
@@ -0,0 +1,113 @@
+Intel Surface Layout
+
+Introduction
+============
+isl is a small library that calculates the layout of Intel GPU surfaces, queries
+those layouts, and queries the properties of surface formats.
+
+
+Independence from User APIs
+===========================
+isl's API is independent of any user-facing graphics API, such as OpenGL and
+Vulkan. This independence allows isl to be used as a shared component by
+multiple Intel drivers.
+
+Rather than mimic the user-facing APIs, the isl API attempts to reflect Intel
+hardware: the actual memory layout of Intel GPU surfaces and how one programs
+the GPU to use those surfaces. For example:
+
+  - The tokens of `enum isl_format` (such as `ISL_FORMAT_R8G8B8A8_UNORM`)
+    match those of the hardware enum `SURFACE_FORMAT` rather than the OpenGL
+    or Vulkan format tokens.  And the values of `isl_format` and
+    `SURFACE_FORMAT` are identical.
+
+  - The OpenGL and Vulkan APIs contain depth and stencil formats. However the
+    hardware enum `SURFACE_FORMAT` does not, and therefore neither does `enum
+    isl_format`. Rather than define new pixel formats that have no hardware
+    counterpart, isl records the intent to use a surface as a depth or stencil
+    buffer with the usage flags `ISL_SURF_USAGE_DEPTH_BIT` and
+    `ISL_SURF_USAGE_STENCIL_BIT`.
+
+  - `struct isl_surf` distinguishes between the surface's logical dimension
+    from the user API's perspective (`enum isl_surf_dim`, which may be 1D, 2D,
+    or 3D) and the layout of those dimensions in memory (`enum isl_dim_layout`).
+
+
+Surface Units
+=============
+
+Intro
+-----
+ISL takes care in its equations to correctly handle conversion among surface
+units (such as pixels and compression blocks) and to carefully distinguish
+between a surface's logical layout in the client API and its physical layout
+in memory.
+
+Symbol names often explicitly declare their unit with a suffix:
+
+   - px: logical pixels
+   - sa: physical surface samples
+   - el: physical surface elements
+   - sa_rows: rows of physical surface samples
+   - el_rows: rows of physical surface elements
+
+Logical units are independent of hardware generation and are closely related
+to the user-facing API (OpenGL and Vulkan). Physical units are dependent on
+hardware generation and reflect the surface's layout in memory.
+
+Definitions
+-----------
+- Logical Pixels (px):
+
+  The surface's layout from the perspective of the client API (OpenGL and
+  Vulkan) is in units of logical pixels. Logical pixels are independent of the
+  surface's layout in memory.
+
+  A surface's width and height, in units of logical pixels, is not affected by
+  the surface's sample count. For example, consider a VkImage created with
+  VkImageCreateInfo{width=w0, height=h0, samples=s0}. The surface's width and
+  height at level 0 is, in units of logical pixels, w0 and h0 regardless of
+  the value of s0.
+
+  For example, the logical array length of a 3D surface is always 1, even on
+  Gen9 where the surface's memory layout is that of an array surface
+  (ISL_DIM_LAYOUT_GEN4_2D).
+
+- Physical Surface Samples (sa):
+
+  For a multisampled surface, this unit has the obvious meaning.
+  A singlesampled surface, from ISL's perspective, is simply a multisampled
+  surface whose sample count is 1.
+
+  For example, consider a 2D single-level non-array surface with samples=4,
+  width_px=64, and height_px=64 (note that the suffix 'px' indicates logical
+  pixels). If the surface's multisample layout is ISL_MSAA_LAYOUT_INTERLEAVED,
+  then the extent of level 0 is, in units of physical surface samples,
+  width_sa=128, height_sa=128, depth_sa=1, array_length_sa=1. If
+  ISL_MSAA_LAYOUT_ARRAY, then width_sa=64, height_sa=64, depth_sa=1,
+  array_length_sa=4.
+
+- Physical Surface Elements (el):
+
+  This unit allows ISL to treat compressed and uncompressed formats
+  identically in many calculations.
+
+  If the surface's pixel format is compressed, such as ETC2, then a surface
+  element is equivalent to a compression block. If uncompressed, then
+  a surface element is equivalent to a surface sample. As a corollary, for
+  a given surface a surface element is at least as large as a surface sample.
+
+Errata
+------
+ISL acquired the term 'surface element' from the Broadwell PRM [1], which
+defines it as follows:
+
+   An element is defined as a pixel in uncompressed surface formats, and as
+   a compression block in compressed surface formats. For MSFMT_DEPTH_STENCIL
+   type multisampled surfaces, an element is a sample.
+
+
+References
+==========
+[1]: Broadwell PRM >> Volume 2d: Command Reference: Structures >>
+     RENDER_SURFACE_STATE Surface Vertical Alignment (p325)
diff --git a/src/isl/isl.c b/src/isl/isl.c
new file mode 100644 (file)
index 0000000..bb3d595
--- /dev/null
@@ -0,0 +1,1272 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "isl.h"
+#include "isl_gen4.h"
+#include "isl_gen6.h"
+#include "isl_gen7.h"
+#include "isl_gen8.h"
+#include "isl_gen9.h"
+#include "isl_priv.h"
+
+/**
+ * Print a "file:line: FINISHME: ..." warning to stderr. Intended to be
+ * invoked through the isl_finishme() macro, which supplies __FILE__ and
+ * __LINE__. The formatted message is truncated to fit the 512-byte buffer.
+ */
+void PRINTFLIKE(3, 4) UNUSED
+__isl_finishme(const char *file, int line, const char *fmt, ...)
+{
+   va_list ap;
+   char buf[512];
+
+   va_start(ap, fmt);
+   vsnprintf(buf, sizeof(buf), fmt, ap);
+   va_end(ap);
+
+   fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buf);
+}
+
+/**
+ * Initialize an isl_device for the given hardware. The brw_device_info
+ * pointer is retained (not copied), so it must outlive the isl_device.
+ */
+void
+isl_device_init(struct isl_device *dev,
+                const struct brw_device_info *info,
+                bool has_bit6_swizzling)
+{
+   dev->info = info;
+   dev->use_separate_stencil = ISL_DEV_GEN(dev) >= 6;
+   dev->has_bit6_swizzling = has_bit6_swizzling;
+
+   /* The ISL_DEV macros may be defined in the CFLAGS, thus hardcoding some
+    * device properties at buildtime. Verify that the macros agree with the
+    * device properties chosen at runtime.
+    */
+   assert(ISL_DEV_GEN(dev) == dev->info->gen);
+   assert(ISL_DEV_USE_SEPARATE_STENCIL(dev) == dev->use_separate_stencil);
+
+   /* Did we break hiz or stencil? */
+   if (ISL_DEV_USE_SEPARATE_STENCIL(dev))
+      assert(info->has_hiz_and_separate_stencil);
+   if (info->must_use_separate_stencil)
+      assert(ISL_DEV_USE_SEPARATE_STENCIL(dev));
+}
+
+/**
+ * Look up the tile dimensions for the given tiling mode and format block
+ * size. ISL_TILING_LINEAR is reported as a 1x1 "tile".
+ *
+ * @param[out] tile_info is written only on success
+ * @return false if the tiling cannot be used: Yf/Ys require gen >= 9 and
+ *         a power-of-two format block size. tile_info is untouched then.
+ */
+bool
+isl_tiling_get_info(const struct isl_device *dev,
+                    enum isl_tiling tiling,
+                    uint32_t format_block_size,
+                    struct isl_tile_info *tile_info)
+{
+   const uint32_t bs = format_block_size;
+   uint32_t width, height;
+
+   assert(bs > 0);
+
+   switch (tiling) {
+   case ISL_TILING_LINEAR:
+      width = 1;
+      height = 1;
+      break;
+
+   case ISL_TILING_X:
+      width = 1 << 9;
+      height = 1 << 3;
+      break;
+
+   case ISL_TILING_Y0:
+      width = 1 << 7;
+      height = 1 << 5;
+      break;
+
+   case ISL_TILING_W:
+      /* XXX: Should W tile be same as Y? */
+      width = 1 << 6;
+      height = 1 << 6;
+      break;
+
+   case ISL_TILING_Yf:
+   case ISL_TILING_Ys: {
+      if (ISL_DEV_GEN(dev) < 9)
+         return false;
+
+      if (!isl_is_pow2(bs))
+         return false;
+
+      bool is_Ys = tiling == ISL_TILING_Ys;
+
+      /* Yf/Ys tile dimensions scale with the format block size; a Ys tile
+       * is 4x as wide and 4x as tall as the corresponding Yf tile.
+       */
+      width = 1 << (6 + (ffs(bs) / 2) + (2 * is_Ys));
+      height = 1 << (6 - (ffs(bs) / 2) + (2 * is_Ys));
+      break;
+   }
+   } /* end switch */
+
+   /* NOTE(review): the switch above has no default case, so an out-of-range
+    * tiling value would leave width/height uninitialized here.
+    */
+   *tile_info = (struct isl_tile_info) {
+      .tiling = tiling,
+      .width = width,
+      .height = height,
+      .size = width * height,
+   };
+
+   return true;
+}
+
+/**
+ * Convenience wrapper: return the tile width and height for \a tiling as an
+ * isl_extent2d.
+ *
+ * NOTE(review): the return value of isl_tiling_get_info() is ignored. If it
+ * fails (Yf/Ys on pre-gen9, or a non-power-of-two block size), tile_info is
+ * read uninitialized below. Presumably callers only pass supported
+ * combinations -- confirm.
+ */
+void
+isl_tiling_get_extent(const struct isl_device *dev,
+                      enum isl_tiling tiling,
+                      uint32_t format_block_size,
+                      struct isl_extent2d *e)
+{
+   struct isl_tile_info tile_info;
+   isl_tiling_get_info(dev, tiling, format_block_size, &tile_info);
+   *e = isl_extent2d(tile_info.width, tile_info.height);
+}
+
+/**
+ * Pick the best-performing tiling mode that is still allowed after filtering
+ * info->tiling_flags against hardware restrictions.
+ *
+ * @param[out] tiling is set only on success
+ * @return false if no tiling mode accommodates the inputs
+ */
+bool
+isl_surf_choose_tiling(const struct isl_device *dev,
+                       const struct isl_surf_init_info *restrict info,
+                       enum isl_tiling *tiling)
+{
+   isl_tiling_flags_t tiling_flags = info->tiling_flags;
+
+   if (ISL_DEV_GEN(dev) >= 7) {
+      gen7_filter_tiling(dev, info, &tiling_flags);
+   } else {
+      /* Pre-gen7 filtering is not implemented yet; fall back to the gen7
+       * rules for now.
+       */
+      isl_finishme("%s: gen%u", __func__, ISL_DEV_GEN(dev));
+      gen7_filter_tiling(dev, info, &tiling_flags);
+   }
+
+   /* Return the given tiling if the filtered flags still allow it. */
+   #define CHOOSE(__tiling) \
+      do { \
+         if (tiling_flags & (1u << (__tiling))) { \
+            *tiling = (__tiling); \
+            return true; \
+          } \
+      } while (0)
+
+   /* Of the tiling modes remaining, choose the one that offers the best
+    * performance.
+    */
+
+   if (info->dim == ISL_SURF_DIM_1D) {
+      /* Prefer linear for 1D surfaces because they do not benefit from
+       * tiling. To the contrary, tiling leads to wasted memory and poor
+       * memory locality due to the swizzling and alignment restrictions
+       * required in tiled surfaces.
+       */
+      CHOOSE(ISL_TILING_LINEAR);
+   }
+
+   CHOOSE(ISL_TILING_Ys);
+   CHOOSE(ISL_TILING_Yf);
+   CHOOSE(ISL_TILING_Y0);
+   CHOOSE(ISL_TILING_X);
+   CHOOSE(ISL_TILING_W);
+   CHOOSE(ISL_TILING_LINEAR);
+
+   #undef CHOOSE
+
+   /* No tiling mode accommodates the inputs. */
+   return false;
+}
+
+/**
+ * Dispatch to the hardware-generation-specific MSAA layout chooser.
+ * Returns false when the generation-specific chooser rejects the
+ * combination (behavior of the gen* implementations is not visible here).
+ */
+static bool
+isl_choose_msaa_layout(const struct isl_device *dev,
+                 const struct isl_surf_init_info *info,
+                 enum isl_tiling tiling,
+                 enum isl_msaa_layout *msaa_layout)
+{
+   if (ISL_DEV_GEN(dev) >= 8) {
+      return gen8_choose_msaa_layout(dev, info, tiling, msaa_layout);
+   } else if (ISL_DEV_GEN(dev) >= 7) {
+      return gen7_choose_msaa_layout(dev, info, tiling, msaa_layout);
+   } else if (ISL_DEV_GEN(dev) >= 6) {
+      return gen6_choose_msaa_layout(dev, info, tiling, msaa_layout);
+   } else {
+      return gen4_choose_msaa_layout(dev, info, tiling, msaa_layout);
+   }
+}
+
+/**
+ * Scale a level's width and height from units of logical pixels to units of
+ * physical surface samples, for interleaved-MSAA surfaces. Either dimension
+ * pointer may be NULL to skip that dimension.
+ */
+static void
+isl_msaa_interleaved_scale_px_to_sa(uint32_t samples,
+                                    uint32_t *width, uint32_t *height)
+{
+   assert(isl_is_pow2(samples));
+
+   /* From the Broadwell PRM >> Volume 5: Memory Views >> Computing Mip Level
+    * Sizes (p133):
+    *
+    *    If the surface is multisampled and it is a depth or stencil surface
+    *    or Multisampled Surface StorageFormat in SURFACE_STATE is
+    *    MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before
+    *    proceeding: [...]
+    *
+    * E.g. samples=2 doubles the width only; samples=4 doubles both width and
+    * height; samples=8 quadruples the width and doubles the height.
+    */
+   if (width)
+      *width = isl_align(*width, 2) << ((ffs(samples) - 0) / 2);
+   if (height)
+      *height = isl_align(*height, 2) << ((ffs(samples) - 1) / 2);
+}
+
+/**
+ * Choose whether the pitch between array slices must span all mip levels
+ * (ISL_ARRAY_PITCH_SPAN_FULL) or may cover only the space actually used
+ * (ISL_ARRAY_PITCH_SPAN_COMPACT), based on hardware generation and the
+ * surface's usage.
+ */
+static enum isl_array_pitch_span
+isl_choose_array_pitch_span(const struct isl_device *dev,
+                            const struct isl_surf_init_info *restrict info,
+                            enum isl_dim_layout dim_layout,
+                            const struct isl_extent4d *phys_level0_sa)
+{
+   switch (dim_layout) {
+   case ISL_DIM_LAYOUT_GEN9_1D:
+   case ISL_DIM_LAYOUT_GEN4_2D:
+      if (ISL_DEV_GEN(dev) >= 8) {
+         /* QPitch becomes programmable in Broadwell. So choose the
+          * most compact QPitch possible in order to conserve memory.
+          *
+          * From the Broadwell PRM >> Volume 2d: Command Reference: Structures
+          * >> RENDER_SURFACE_STATE Surface QPitch (p325):
+          *
+          *    - Software must ensure that this field is set to a value
+          *      sufficiently large such that the array slices in the surface
+          *      do not overlap. Refer to the Memory Data Formats section for
+          *      information on how surfaces are stored in memory.
+          *
+          *    - This field specifies the distance in rows between array
+          *      slices.  It is used only in the following cases:
+          *
+          *          - Surface Array is enabled OR
+          *          - Number of Mulitsamples is not NUMSAMPLES_1 and
+          *            Multisampled Surface Storage Format set to MSFMT_MSS OR
+          *          - Surface Type is SURFTYPE_CUBE
+          */
+         return ISL_ARRAY_PITCH_SPAN_COMPACT;
+      } else if (ISL_DEV_GEN(dev) >= 7) {
+         /* Note that Ivybridge introduces
+          * RENDER_SURFACE_STATE.SurfaceArraySpacing, which provides the
+          * driver more control over the QPitch.
+          */
+
+         if (phys_level0_sa->array_len == 1) {
+            /* The hardware will never use the QPitch. So choose the most
+             * compact QPitch possible in order to conserve memory.
+             */
+            return ISL_ARRAY_PITCH_SPAN_COMPACT;
+         }
+
+         if (isl_surf_usage_is_depth_or_stencil(info->usage)) {
+            /* From the Ivybridge PRM >> Volume 1 Part 1: Graphics Core >>
+             * Section 6.18.4.7: Surface Arrays (p112):
+             *
+             *    If Surface Array Spacing is set to ARYSPC_FULL (note that
+             *    the depth buffer and stencil buffer have an implied value of
+             *    ARYSPC_FULL):
+             */
+            return ISL_ARRAY_PITCH_SPAN_COMPACT;
+         }
+
+         if (info->levels == 1) {
+            /* We are able to set RENDER_SURFACE_STATE.SurfaceArraySpacing
+             * to ARYSPC_LOD0.
+             */
+            return ISL_ARRAY_PITCH_SPAN_COMPACT;
+         }
+
+         return ISL_ARRAY_PITCH_SPAN_FULL;
+      } else if ((ISL_DEV_GEN(dev) == 5 || ISL_DEV_GEN(dev) == 6) &&
+                 ISL_DEV_USE_SEPARATE_STENCIL(dev) &&
+                 isl_surf_usage_is_stencil(info->usage)) {
+         /* [ILK-SNB] Errata from the Sandy Bridge PRM >> Volume 4 Part 1:
+          * Graphics Core >> Section 7.18.3.7: Surface Arrays:
+          *
+          *    The separate stencil buffer does not support mip mapping, thus
+          *    the storage for LODs other than LOD 0 is not needed.
+          */
+         assert(info->levels == 1);
+         assert(phys_level0_sa->array_len == 1);
+         return ISL_ARRAY_PITCH_SPAN_COMPACT;
+      } else {
+         /* NOTE(review): this condition duplicates the "else if" directly
+          * above and can therefore never be true here -- dead code.
+          */
+         if ((ISL_DEV_GEN(dev) == 5 || ISL_DEV_GEN(dev) == 6) &&
+             ISL_DEV_USE_SEPARATE_STENCIL(dev) &&
+             isl_surf_usage_is_stencil(info->usage)) {
+            /* [ILK-SNB] Errata from the Sandy Bridge PRM >> Volume 4 Part 1:
+             * Graphics Core >> Section 7.18.3.7: Surface Arrays:
+             *
+             *    The separate stencil buffer does not support mip mapping,
+             *    thus the storage for LODs other than LOD 0 is not needed.
+             */
+            assert(info->levels == 1);
+            assert(phys_level0_sa->array_len == 1);
+            return ISL_ARRAY_PITCH_SPAN_COMPACT;
+         }
+
+         if (phys_level0_sa->array_len == 1) {
+            /* The hardware will never use the QPitch. So choose the most
+             * compact QPitch possible in order to conserve memory.
+             */
+            return ISL_ARRAY_PITCH_SPAN_COMPACT;
+         }
+
+         return ISL_ARRAY_PITCH_SPAN_FULL;
+      }
+
+   case ISL_DIM_LAYOUT_GEN4_3D:
+      /* The hardware will never use the QPitch. So choose the most
+       * compact QPitch possible in order to conserve memory.
+       */
+      return ISL_ARRAY_PITCH_SPAN_COMPACT;
+   }
+
+   unreachable("bad isl_dim_layout");
+   return ISL_ARRAY_PITCH_SPAN_FULL;
+}
+
+/**
+ * Dispatch to the generation-specific routine that chooses the image
+ * alignment, in units of surface elements.
+ */
+static void
+isl_choose_image_alignment_el(const struct isl_device *dev,
+                              const struct isl_surf_init_info *restrict info,
+                              enum isl_tiling tiling,
+                              enum isl_msaa_layout msaa_layout,
+                              struct isl_extent3d *image_align_el)
+{
+   if (ISL_DEV_GEN(dev) >= 9) {
+      gen9_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                     image_align_el);
+   } else if (ISL_DEV_GEN(dev) >= 8) {
+      gen8_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                     image_align_el);
+   } else if (ISL_DEV_GEN(dev) >= 7) {
+      gen7_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                     image_align_el);
+   } else if (ISL_DEV_GEN(dev) >= 6) {
+      gen6_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                     image_align_el);
+   } else {
+      gen4_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                     image_align_el);
+   }
+}
+
+/**
+ * Map the surface's logical dimensionality to its memory layout. On gen9+,
+ * 1D surfaces get the dedicated GEN9_1D layout and 3D surfaces are laid out
+ * like 2D array surfaces; before gen9, 3D surfaces use the GEN4_3D layout.
+ */
+static enum isl_dim_layout
+isl_surf_choose_dim_layout(const struct isl_device *dev,
+                           enum isl_surf_dim logical_dim)
+{
+   if (ISL_DEV_GEN(dev) >= 9) {
+      switch (logical_dim) {
+      case ISL_SURF_DIM_1D:
+         return ISL_DIM_LAYOUT_GEN9_1D;
+      case ISL_SURF_DIM_2D:
+      case ISL_SURF_DIM_3D:
+         return ISL_DIM_LAYOUT_GEN4_2D;
+      }
+   } else {
+      switch (logical_dim) {
+      case ISL_SURF_DIM_1D:
+      case ISL_SURF_DIM_2D:
+         return ISL_DIM_LAYOUT_GEN4_2D;
+      case ISL_SURF_DIM_3D:
+         return ISL_DIM_LAYOUT_GEN4_3D;
+      }
+   }
+
+   unreachable("bad isl_surf_dim");
+   return ISL_DIM_LAYOUT_GEN4_2D;
+}
+
+/**
+ * Calculate the physical extent of the surface's first level, in units of
+ * surface samples. The result is aligned to the format's compression block.
+ */
+static void
+isl_calc_phys_level0_extent_sa(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_dim_layout dim_layout,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent4d *phys_level0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   if (isl_format_is_yuv(info->format))
+      isl_finishme("%s:%s: YUV format", __FILE__, __func__);
+
+   switch (info->dim) {
+   case ISL_SURF_DIM_1D:
+      /* 1D surfaces cannot be multisampled, compressed, or have height or
+       * depth.
+       */
+      assert(info->height == 1);
+      assert(info->depth == 1);
+      assert(info->samples == 1);
+      assert(!isl_format_is_compressed(info->format));
+
+      switch (dim_layout) {
+      case ISL_DIM_LAYOUT_GEN4_3D:
+         unreachable("bad isl_dim_layout");
+
+      case ISL_DIM_LAYOUT_GEN9_1D:
+      case ISL_DIM_LAYOUT_GEN4_2D:
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = info->width,
+            .h = 1,
+            .d = 1,
+            .a = info->array_len,
+         };
+         break;
+      }
+      break;
+
+   case ISL_SURF_DIM_2D:
+      assert(dim_layout == ISL_DIM_LAYOUT_GEN4_2D);
+
+      if (tiling == ISL_TILING_Ys && info->samples > 1)
+         isl_finishme("%s:%s: multisample TileYs layout", __FILE__, __func__);
+
+      switch (msaa_layout) {
+      case ISL_MSAA_LAYOUT_NONE:
+         assert(info->depth == 1);
+         assert(info->samples == 1);
+
+         /* Align to the format's compression block. */
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = isl_align(info->width, fmtl->bw),
+            .h = isl_align(info->height, fmtl->bh),
+            .d = 1,
+            .a = info->array_len,
+         };
+         break;
+
+      case ISL_MSAA_LAYOUT_ARRAY:
+         /* Each sample becomes an array slice. */
+         assert(info->depth == 1);
+         assert(info->array_len == 1);
+         assert(!isl_format_is_compressed(info->format));
+
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = info->width,
+            .h = info->height,
+            .d = 1,
+            .a = info->samples,
+         };
+         break;
+
+      case ISL_MSAA_LAYOUT_INTERLEAVED:
+         /* The samples are interleaved into the pixel grid, so the physical
+          * width/height must be scaled up accordingly.
+          */
+         assert(info->depth == 1);
+         assert(info->array_len == 1);
+         assert(!isl_format_is_compressed(info->format));
+
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = info->width,
+            .h = info->height,
+            .d = 1,
+            .a = 1,
+         };
+
+         isl_msaa_interleaved_scale_px_to_sa(info->samples,
+                                             &phys_level0_sa->w,
+                                             &phys_level0_sa->h);
+         break;
+      }
+      break;
+
+   case ISL_SURF_DIM_3D:
+      assert(info->array_len == 1);
+      assert(info->samples == 1);
+
+      if (fmtl->bd > 1) {
+         isl_finishme("%s:%s: compression block with depth > 1",
+                      __FILE__, __func__);
+      }
+
+      switch (dim_layout) {
+      case ISL_DIM_LAYOUT_GEN9_1D:
+         unreachable("bad isl_dim_layout");
+
+      case ISL_DIM_LAYOUT_GEN4_2D:
+         /* Gen9 lays out 3D surfaces as 2D arrays: depth becomes the array
+          * length.
+          */
+         assert(ISL_DEV_GEN(dev) >= 9);
+
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = isl_align(info->width, fmtl->bw),
+            .h = isl_align(info->height, fmtl->bh),
+            .d = 1,
+            .a = info->depth,
+         };
+         break;
+
+      case ISL_DIM_LAYOUT_GEN4_3D:
+         assert(ISL_DEV_GEN(dev) < 9);
+         *phys_level0_sa = (struct isl_extent4d) {
+            .w = isl_align(info->width, fmtl->bw),
+            .h = isl_align(info->height, fmtl->bh),
+            .d = info->depth,
+            .a = 1,
+         };
+         break;
+      }
+      break;
+   }
+}
+
+/**
+ * A variant of isl_calc_phys_slice0_extent_sa() specific to
+ * ISL_DIM_LAYOUT_GEN4_2D.
+ */
+static void
+isl_calc_phys_slice0_extent_sa_gen4_2d(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      enum isl_msaa_layout msaa_layout,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      struct isl_extent2d *phys_slice0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   assert(phys_level0_sa->depth == 1);
+
+   if (info->levels == 1 && msaa_layout != ISL_MSAA_LAYOUT_INTERLEAVED) {
+      /* Do not pad the surface to the image alignment. Instead, pad it only
+       * to the pixel format's block alignment.
+       *
+       * For tiled surfaces, using a reduced alignment here avoids wasting CPU
+       * cycles on the below mipmap layout calculations. Reducing the
+       * alignment here is safe because we later align the row pitch and array
+       * pitch to the tile boundary. It is safe even for
+       * ISL_MSAA_LAYOUT_INTERLEAVED, because phys_level0_sa is already scaled
+       * to accommodate the interleaved samples.
+       *
+       * NOTE(review): the enclosing condition excludes
+       * ISL_MSAA_LAYOUT_INTERLEAVED, so the previous sentence describes a
+       * case this path never takes -- confirm intent.
+       *
+       * For linear surfaces, reducing the alignment here permits us to later
+       * choose an arbitrary, non-aligned row pitch. If the surface backs
+       * a VkBuffer, then an arbitrary pitch may be needed to accommodate
+       * VkBufferImageCopy::bufferRowLength.
+       */
+      *phys_slice0_sa = (struct isl_extent2d) {
+         .w = isl_align_npot(phys_level0_sa->w, fmtl->bw),
+         .h = isl_align_npot(phys_level0_sa->h, fmtl->bh),
+      };
+      return;
+   }
+
+   /* Classic 2D mip layout: level 0 on top, level 1 below it on the left,
+    * level 2 to the right of level 1, and levels >= 3 stacked below level 2.
+    * Track the widths of the top/bottom rows and the heights of the
+    * left/right columns as each level is placed.
+    */
+   uint32_t slice_top_w = 0;
+   uint32_t slice_bottom_w = 0;
+   uint32_t slice_left_h = 0;
+   uint32_t slice_right_h = 0;
+
+   uint32_t W0 = phys_level0_sa->w;
+   uint32_t H0 = phys_level0_sa->h;
+
+   for (uint32_t l = 0; l < info->levels; ++l) {
+      uint32_t W = isl_minify(W0, l);
+      uint32_t H = isl_minify(H0, l);
+
+      if (msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
+         /* From the Broadwell PRM >> Volume 5: Memory Views >> Computing Mip Level
+          * Sizes (p133):
+          *
+          *    If the surface is multisampled and it is a depth or stencil
+          *    surface or Multisampled Surface StorageFormat in
+          *    SURFACE_STATE is MSFMT_DEPTH_STENCIL, W_L and H_L must be
+          *    adjusted as follows before proceeding: [...]
+          */
+         isl_msaa_interleaved_scale_px_to_sa(info->samples, &W, &H);
+      }
+
+      uint32_t w = isl_align_npot(W, image_align_sa->w);
+      uint32_t h = isl_align_npot(H, image_align_sa->h);
+
+      if (l == 0) {
+         slice_top_w = w;
+         slice_left_h = h;
+         slice_right_h = h;
+      } else if (l == 1) {
+         slice_bottom_w = w;
+         slice_left_h += h;
+      } else if (l == 2) {
+         slice_bottom_w += w;
+         slice_right_h += h;
+      } else {
+         slice_right_h += h;
+      }
+   }
+
+   *phys_slice0_sa = (struct isl_extent2d) {
+      .w = MAX(slice_top_w, slice_bottom_w),
+      .h = MAX(slice_left_h, slice_right_h),
+   };
+}
+
+/**
+ * A variant of isl_calc_phys_slice0_extent_sa() specific to
+ * ISL_DIM_LAYOUT_GEN4_3D.
+ */
+static void
+isl_calc_phys_slice0_extent_sa_gen4_3d(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      struct isl_extent2d *phys_slice0_sa)
+{
+   assert(info->samples == 1);
+   assert(phys_level0_sa->array_len == 1);
+
+   uint32_t slice_w = 0;
+   uint32_t slice_h = 0;
+
+   uint32_t W0 = phys_level0_sa->w;
+   uint32_t H0 = phys_level0_sa->h;
+   uint32_t D0 = phys_level0_sa->d;
+
+   for (uint32_t l = 0; l < info->levels; ++l) {
+      uint32_t level_w = isl_align_npot(isl_minify(W0, l), image_align_sa->w);
+      uint32_t level_h = isl_align_npot(isl_minify(H0, l), image_align_sa->h);
+      uint32_t level_d = isl_align_npot(isl_minify(D0, l), image_align_sa->d);
+
+      /* Level l places up to 2^l depth slices side by side per row; any
+       * remaining slices wrap onto additional rows.
+       */
+      uint32_t max_layers_horiz = MIN(level_d, 1u << l);
+      uint32_t max_layers_vert = isl_align(level_d, 1u << l) / (1u << l);
+
+      /* The slice is as wide as its widest row of layers and as tall as all
+       * the levels' rows stacked vertically.
+       */
+      slice_w = MAX(slice_w, level_w * max_layers_horiz);
+      slice_h += level_h * max_layers_vert;
+   }
+
+   *phys_slice0_sa = (struct isl_extent2d) {
+      .w = slice_w,
+      .h = slice_h,
+   };
+}
+
+/**
+ * A variant of isl_calc_phys_slice0_extent_sa() specific to
+ * ISL_DIM_LAYOUT_GEN9_1D. All mip levels sit side by side in a single row,
+ * so the slice is one "row" tall and as wide as the sum of the levels'
+ * aligned widths.
+ */
+static void
+isl_calc_phys_slice0_extent_sa_gen9_1d(
+      const struct isl_device *dev,
+      const struct isl_surf_init_info *restrict info,
+      const struct isl_extent3d *image_align_sa,
+      const struct isl_extent4d *phys_level0_sa,
+      struct isl_extent2d *phys_slice0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   assert(phys_level0_sa->height == 1);
+   assert(phys_level0_sa->depth == 1);
+   assert(info->samples == 1);
+   assert(image_align_sa->w >= fmtl->bw);
+
+   uint32_t slice_w = 0;
+   const uint32_t W0 = phys_level0_sa->w;
+
+   for (uint32_t l = 0; l < info->levels; ++l) {
+      uint32_t W = isl_minify(W0, l);
+      uint32_t w = isl_align_npot(W, image_align_sa->w);
+
+      slice_w += w;
+   }
+
+   *phys_slice0_sa = isl_extent2d(slice_w, 1);
+}
+
+/**
+ * Calculate the physical extent of the surface's first array slice, in units
+ * of surface samples. If the surface is multi-leveled, then the result will
+ * be aligned to \a image_align_sa.
+ */
+static void
+isl_calc_phys_slice0_extent_sa(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_dim_layout dim_layout,
+                               enum isl_msaa_layout msaa_layout,
+                               const struct isl_extent3d *image_align_sa,
+                               const struct isl_extent4d *phys_level0_sa,
+                               struct isl_extent2d *phys_slice0_sa)
+{
+   /* Dispatch on the memory layout. Every enum value is handled, so control
+    * cannot fall off the end for a valid dim_layout; an invalid value would
+    * silently leave phys_slice0_sa unwritten.
+    */
+   switch (dim_layout) {
+   case ISL_DIM_LAYOUT_GEN9_1D:
+      isl_calc_phys_slice0_extent_sa_gen9_1d(dev, info,
+                                             image_align_sa, phys_level0_sa,
+                                             phys_slice0_sa);
+      return;
+   case ISL_DIM_LAYOUT_GEN4_2D:
+      isl_calc_phys_slice0_extent_sa_gen4_2d(dev, info, msaa_layout,
+                                             image_align_sa, phys_level0_sa,
+                                             phys_slice0_sa);
+      return;
+   case ISL_DIM_LAYOUT_GEN4_3D:
+      isl_calc_phys_slice0_extent_sa_gen4_3d(dev, info, image_align_sa,
+                                             phys_level0_sa, phys_slice0_sa);
+      return;
+   }
+}
+
+/**
+ * Calculate the pitch between physical array slices, in units of rows of
+ * surface elements. Works in sample rows first, then converts to element
+ * rows by dividing out the format's block height.
+ */
+static uint32_t
+isl_calc_array_pitch_el_rows(const struct isl_device *dev,
+                             const struct isl_surf_init_info *restrict info,
+                             const struct isl_tile_info *tile_info,
+                             enum isl_dim_layout dim_layout,
+                             enum isl_array_pitch_span array_pitch_span,
+                             const struct isl_extent3d *image_align_sa,
+                             const struct isl_extent4d *phys_level0_sa,
+                             const struct isl_extent2d *phys_slice0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+   uint32_t pitch_sa_rows = 0;
+
+   switch (dim_layout) {
+   case ISL_DIM_LAYOUT_GEN9_1D:
+      /* Each row is an array slice */
+      pitch_sa_rows = 1;
+      break;
+   case ISL_DIM_LAYOUT_GEN4_2D:
+      switch (array_pitch_span) {
+      case ISL_ARRAY_PITCH_SPAN_COMPACT:
+         pitch_sa_rows = isl_align_npot(phys_slice0_sa->h, image_align_sa->h);
+         break;
+      case ISL_ARRAY_PITCH_SPAN_FULL: {
+         /* The QPitch equation is found in the Broadwell PRM >> Volume 5:
+          * Memory Views >> Common Surface Formats >> Surface Layout >> 2D
+          * Surfaces >> Surface Arrays.
+          */
+         uint32_t H0_sa = phys_level0_sa->h;
+         uint32_t H1_sa = isl_minify(H0_sa, 1);
+
+         uint32_t h0_sa = isl_align_npot(H0_sa, image_align_sa->h);
+         uint32_t h1_sa = isl_align_npot(H1_sa, image_align_sa->h);
+
+         uint32_t m;
+         if (ISL_DEV_GEN(dev) >= 7) {
+            /* The QPitch equation changed slightly in Ivybridge. */
+            m = 12;
+         } else {
+            m = 11;
+         }
+
+         pitch_sa_rows = h0_sa + h1_sa + (m * image_align_sa->h);
+
+         if (ISL_DEV_GEN(dev) == 6 && info->samples > 1 &&
+             (info->height % 4 == 1)) {
+            /* [SNB] Errata from the Sandy Bridge PRM >> Volume 4 Part 1:
+             * Graphics Core >> Section 7.18.3.7: Surface Arrays:
+             *
+             *    [SNB] Errata: Sampler MSAA Qpitch will be 4 greater than
+             *    the value calculated in the equation above , for every
+             *    other odd Surface Height starting from 1 i.e. 1,5,9,13.
+             *
+             * XXX(chadv): Is the errata natural corollary of the physical
+             * layout of interleaved samples?
+             */
+            pitch_sa_rows += 4;
+         }
+
+         /* Round up so the sample-row pitch converts evenly to element rows
+          * below.
+          */
+         pitch_sa_rows = isl_align_npot(pitch_sa_rows, fmtl->bh);
+         } /* end case */
+         break;
+      }
+      break;
+   case ISL_DIM_LAYOUT_GEN4_3D:
+      assert(array_pitch_span == ISL_ARRAY_PITCH_SPAN_COMPACT);
+      pitch_sa_rows = isl_align_npot(phys_slice0_sa->h, image_align_sa->h);
+      break;
+   default:
+      unreachable("bad isl_dim_layout");
+      break;
+   }
+
+   /* Convert sample rows to element rows. */
+   assert(pitch_sa_rows % fmtl->bh == 0);
+   uint32_t pitch_el_rows = pitch_sa_rows / fmtl->bh;
+
+   if (ISL_DEV_GEN(dev) >= 9 &&
+       info->dim == ISL_SURF_DIM_3D &&
+       tile_info->tiling != ISL_TILING_LINEAR) {
+      /* From the Skylake BSpec >> RENDER_SURFACE_STATE >> Surface QPitch:
+       *
+       *    Tile Mode != Linear: This field must be set to an integer multiple
+       *    of the tile height
+       */
+      pitch_el_rows = isl_align(pitch_el_rows, tile_info->height);
+   }
+
+   return pitch_el_rows;
+}
+
+/**
+ * Calculate the pitch of each surface row, in bytes.
+ */
+static uint32_t
+isl_calc_row_pitch(const struct isl_device *dev,
+                   const struct isl_surf_init_info *restrict info,
+                   const struct isl_tile_info *tile_info,
+                   const struct isl_extent3d *image_align_sa,
+                   const struct isl_extent2d *phys_slice0_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   /* Never return less than the caller's requested minimum pitch. */
+   uint32_t pitch = info->min_pitch;
+
+   /* The row must span at least one full slice of surface elements.
+    *
+    * The Broadwell PRM >> Volume 5: Memory Views >> Common Surface Formats
+    * >> Surface Padding Requirements explains why the surface must be
+    * padded out like this: both the data port and the sampling engine may
+    * fetch pixels/texels outside the surface proper whenever those lie in
+    * the same cache request (or cache line) as in-surface data. The extra
+    * data is discarded, but if it falls outside pages with valid GTT
+    * entries, a GTT error results. The surface is therefore extended to
+    * the next multiple of the alignment unit in each dimension so that
+    * every touched page is backed by a valid GTT entry.
+    */
+   assert(phys_slice0_sa->w % fmtl->bw == 0);
+   const uint32_t slice_row_size = fmtl->bs * phys_slice0_sa->w;
+   if (pitch < slice_row_size)
+      pitch = slice_row_size;
+
+   if (tile_info->tiling == ISL_TILING_LINEAR) {
+      /* From the Broadwell PRM >> Volume 2d: Command Reference: Structures
+       * >> RENDER_SURFACE_STATE Surface Pitch (p349): linear render target
+       * surfaces (and surfaces accessed with typed data port messages)
+       * require the pitch to be a multiple of the element size, or of
+       * 2 * element size for YUV surface formats. Other linear surfaces may
+       * use any pitch. (The SURFTYPE_BUFFER/SURFTYPE_STRBUF requirements
+       * are irrelevant here because isl doesn't do buffers.)
+       */
+      if (info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) {
+         const uint32_t unit = isl_format_is_yuv(info->format) ?
+                               2 * fmtl->bs : fmtl->bs;
+         pitch = isl_align_npot(pitch, unit);
+      }
+   } else {
+      /* From the Broadwell PRM >> Volume 2d: Command Reference: Structures
+       * >> RENDER_SURFACE_STATE Surface Pitch (p349): for tiled surfaces,
+       * the pitch must be a multiple of the tile width.
+       */
+      pitch = isl_align(pitch, tile_info->width);
+   }
+
+   return pitch;
+}
+
+/**
+ * Calculate the surface's total height, including padding, in units of
+ * surface elements.
+ *
+ * The padding requirements come from the Broadwell PRM >> Volume 5: Memory
+ * Views >> Common Surface Formats >> Surface Padding Requirements. Both the
+ * data port and the sampling engine may access data outside the surface
+ * when it shares a cache request (or cache line) with in-surface data;
+ * "padding" at the bottom of the surface keeps those accesses on pages
+ * with valid GTT entries.
+ */
+static uint32_t
+isl_calc_total_height_el(const struct isl_device *dev,
+                         const struct isl_surf_init_info *restrict info,
+                         const struct isl_tile_info *tile_info,
+                         uint32_t phys_array_len,
+                         uint32_t row_pitch,
+                         uint32_t array_pitch_el_rows)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   uint32_t total_h_el = phys_array_len * array_pitch_el_rows;
+
+   /* Requirements expressed in bytes rather than rows accumulate here and
+    * are converted to rows at the very end.
+    */
+   uint32_t pad_bytes = 0;
+
+   /* We can safely ignore the first padding requirement, quoted below,
+    * because isl doesn't do buffers.
+    *
+    *    - [pre-BDW] For buffers, which have no inherent “height,” padding
+    *      requirements are different. A buffer must be padded to the next
+    *      multiple of 256 array elements, with an additional 16 bytes added
+    *      beyond that to account for the L1 cache line.
+    */
+
+   /*
+    *    - For compressed textures [...], padding at the bottom of the surface
+    *      is to an even compressed row.
+    */
+   if (isl_format_is_compressed(info->format))
+      total_h_el = isl_align(total_h_el, 2);
+
+   /*
+    *    - For cube surfaces, an additional two rows of padding are required
+    *      at the bottom of the surface.
+    */
+   if (info->usage & ISL_SURF_USAGE_CUBE_BIT)
+      total_h_el += 2;
+
+   /*
+    *    - For packed YUV, 96 bpt, 48 bpt, and 24 bpt surface formats,
+    *      additional padding is required. These surfaces require an extra row
+    *      plus 16 bytes of padding at the bottom in addition to the general
+    *      padding requirements.
+    *
+    * NOTE: the PRM speaks in bits per texel, whereas fmtl->bs is in bytes
+    * (it is multiplied by a width in surface elements to produce a byte
+    * pitch elsewhere in this file). Convert to bits before comparing;
+    * comparing bs directly against 96/48/24, as this code previously did,
+    * could never match a real format.
+    */
+   if (isl_format_is_yuv(info->format) &&
+       (8 * fmtl->bs == 96 || 8 * fmtl->bs == 48 || 8 * fmtl->bs == 24)) {
+      total_h_el += 1;
+      pad_bytes += 16;
+   }
+
+   /*
+    *    - For linear surfaces, additional padding of 64 bytes is required at
+    *      the bottom of the surface. This is in addition to the padding
+    *      required above.
+    */
+   if (tile_info->tiling == ISL_TILING_LINEAR)
+      pad_bytes += 64;
+
+   /* The below text weakens, not strengthens, the padding requirements for
+    * linear surfaces. Therefore we can safely ignore it.
+    *
+    *    - [BDW+] For SURFTYPE_BUFFER, SURFTYPE_1D, and SURFTYPE_2D non-array,
+    *      non-MSAA, non-mip-mapped surfaces in linear memory, the only
+    *      padding requirement is to the next aligned 64-byte boundary beyond
+    *      the end of the surface. The rest of the padding requirements
+    *      documented above do not apply to these surfaces.
+    */
+
+   /*
+    *    - [SKL+] For SURFTYPE_2D and SURFTYPE_3D with linear mode and
+    *      height % 4 != 0, the surface must be padded with
+    *      4-(height % 4)*Surface Pitch # of bytes.
+    */
+   if (ISL_DEV_GEN(dev) >= 9 &&
+       tile_info->tiling == ISL_TILING_LINEAR &&
+       (info->dim == ISL_SURF_DIM_2D || info->dim == ISL_SURF_DIM_3D)) {
+      total_h_el = isl_align(total_h_el, 4);
+   }
+
+   /*
+    *    - [SKL+] For SURFTYPE_1D with linear mode, the surface must be padded
+    *      to 4 times the Surface Pitch # of bytes
+    */
+   if (ISL_DEV_GEN(dev) >= 9 &&
+       tile_info->tiling == ISL_TILING_LINEAR &&
+       info->dim == ISL_SURF_DIM_1D) {
+      total_h_el += 4;
+   }
+
+   /* Be sloppy. Align any leftover padding to a row boundary. */
+   total_h_el += isl_align_div_npot(pad_bytes, row_pitch);
+
+   return total_h_el;
+}
+
+/**
+ * Initialize an isl_surf from the given creation parameters.
+ *
+ * Computes the physical layout of the surface — image alignment, padded
+ * level-0 extent, row pitch, array pitch, total size, and base-address
+ * alignment — and writes the result to @surf.
+ *
+ * Returns false if no tiling compatible with @info can be chosen, if the
+ * tiling parameters cannot be queried, or if no valid MSAA layout exists;
+ * otherwise returns true. On failure @surf is left unmodified (it is only
+ * assigned at the very end).
+ */
+bool
+isl_surf_init_s(const struct isl_device *dev,
+                struct isl_surf *surf,
+                const struct isl_surf_init_info *restrict info)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   /* The caller-visible extent of miplevel 0, in pixels. */
+   const struct isl_extent4d logical_level0_px = {
+      .w = info->width,
+      .h = info->height,
+      .d = info->depth,
+      .a = info->array_len,
+   };
+
+   enum isl_dim_layout dim_layout =
+      isl_surf_choose_dim_layout(dev, info->dim);
+
+   /* Choose a tiling; fail if none is compatible with @info. */
+   enum isl_tiling tiling;
+   if (!isl_surf_choose_tiling(dev, info, &tiling))
+      return false;
+
+   struct isl_tile_info tile_info;
+   if (!isl_tiling_get_info(dev, tiling, fmtl->bs, &tile_info))
+      return false;
+
+   enum isl_msaa_layout msaa_layout;
+   if (!isl_choose_msaa_layout(dev, info, tiling, &msaa_layout))
+       return false;
+
+   /* Image alignment, chosen in elements and then converted to samples. */
+   struct isl_extent3d image_align_el;
+   isl_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                 &image_align_el);
+
+   struct isl_extent3d image_align_sa =
+      isl_extent3d_el_to_sa(info->format, image_align_el);
+
+   /* Physical extent of miplevel 0, in samples; must be a whole number of
+    * compression blocks in each dimension.
+    */
+   struct isl_extent4d phys_level0_sa;
+   isl_calc_phys_level0_extent_sa(dev, info, dim_layout, tiling, msaa_layout,
+                                  &phys_level0_sa);
+   assert(phys_level0_sa.w % fmtl->bw == 0);
+   assert(phys_level0_sa.h % fmtl->bh == 0);
+
+   enum isl_array_pitch_span array_pitch_span =
+      isl_choose_array_pitch_span(dev, info, dim_layout, &phys_level0_sa);
+
+   /* Extent of array slice 0, in samples; feeds the row-pitch and
+    * array-pitch calculations below.
+    */
+   struct isl_extent2d phys_slice0_sa;
+   isl_calc_phys_slice0_extent_sa(dev, info, dim_layout, msaa_layout,
+                                  &image_align_sa, &phys_level0_sa,
+                                  &phys_slice0_sa);
+   assert(phys_slice0_sa.w % fmtl->bw == 0);
+   assert(phys_slice0_sa.h % fmtl->bh == 0);
+
+   const uint32_t row_pitch = isl_calc_row_pitch(dev, info, &tile_info,
+                                                 &image_align_sa,
+                                                 &phys_slice0_sa);
+
+   const uint32_t array_pitch_el_rows =
+      isl_calc_array_pitch_el_rows(dev, info, &tile_info, dim_layout,
+                                   array_pitch_span, &image_align_sa,
+                                   &phys_level0_sa, &phys_slice0_sa);
+
+   const uint32_t total_h_el =
+      isl_calc_total_height_el(dev, info, &tile_info,
+                               phys_level0_sa.array_len, row_pitch,
+                               array_pitch_el_rows);
+
+   /* Total size in bytes, with the height rounded up to a whole tile. */
+   const uint32_t total_h_sa = total_h_el * fmtl->bh;
+   const uint32_t size = row_pitch * isl_align(total_h_sa, tile_info.height);
+
+   /* Alignment of surface base address, in bytes */
+   uint32_t base_alignment = MAX(1, info->min_alignment);
+   assert(isl_is_pow2(base_alignment) && isl_is_pow2(tile_info.size));
+   base_alignment = MAX(base_alignment, tile_info.size);
+
+   *surf = (struct isl_surf) {
+      .dim = info->dim,
+      .dim_layout = dim_layout,
+      .msaa_layout = msaa_layout,
+      .tiling = tiling,
+      .format = info->format,
+
+      .levels = info->levels,
+      .samples = info->samples,
+
+      .image_alignment_el = image_align_el,
+      .logical_level0_px = logical_level0_px,
+      .phys_level0_sa = phys_level0_sa,
+
+      .size = size,
+      .alignment = base_alignment,
+      .row_pitch = row_pitch,
+      .array_pitch_el_rows = array_pitch_el_rows,
+      .array_pitch_span = array_pitch_span,
+
+      .usage = info->usage,
+   };
+
+   return true;
+}
+
+/**
+ * A variant of isl_surf_get_image_offset_sa() specific to
+ * ISL_DIM_LAYOUT_GEN4_2D.
+ *
+ * Array slices stack vertically, spaced by the array pitch. Within a
+ * slice, the arithmetic below places level 1 directly under level 0, and
+ * every level >= 2 in a column to the right of level 1.
+ */
+static void
+get_image_offset_sa_gen4_2d(const struct isl_surf *surf,
+                            uint32_t level, uint32_t layer,
+                            uint32_t *x_offset_sa,
+                            uint32_t *y_offset_sa)
+{
+   assert(level < surf->levels);
+   assert(layer < surf->phys_level0_sa.array_len);
+   assert(surf->phys_level0_sa.depth == 1);
+
+   const struct isl_extent3d image_align_sa =
+      isl_surf_get_image_alignment_sa(surf);
+
+   const uint32_t W0 = surf->phys_level0_sa.width;
+   const uint32_t H0 = surf->phys_level0_sa.height;
+
+   uint32_t x_sa = 0;
+   uint32_t y_sa = layer * isl_surf_get_array_pitch_sa_rows(surf);
+
+   for (uint32_t l = 0; l < level; ++l) {
+      if (l != 1) {
+         /* Stepping past any level other than 1 moves us downward. */
+         uint32_t H = isl_minify(H0, l);
+
+         if (surf->msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED)
+            isl_msaa_interleaved_scale_px_to_sa(surf->samples, NULL, &H);
+
+         y_sa += isl_align_npot(H, image_align_sa.h);
+      } else {
+         /* Stepping past level 1 moves us rightward into the side column. */
+         uint32_t W = isl_minify(W0, l);
+
+         if (surf->msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED)
+            isl_msaa_interleaved_scale_px_to_sa(surf->samples, &W, NULL);
+
+         x_sa += isl_align_npot(W, image_align_sa.w);
+      }
+   }
+
+   *x_offset_sa = x_sa;
+   *y_offset_sa = y_sa;
+}
+
+/**
+ * A variant of isl_surf_get_image_offset_sa() specific to
+ * ISL_DIM_LAYOUT_GEN4_3D.
+ *
+ * At miplevel l, the depth slices are laid out in a grid whose rows each
+ * hold up to 2^l slices; successive levels stack vertically.
+ */
+static void
+get_image_offset_sa_gen4_3d(const struct isl_surf *surf,
+                            uint32_t level, uint32_t logical_z_offset_px,
+                            uint32_t *x_offset_sa,
+                            uint32_t *y_offset_sa)
+{
+   assert(level < surf->levels);
+   assert(logical_z_offset_px < isl_minify(surf->phys_level0_sa.depth, level));
+   assert(surf->phys_level0_sa.array_len == 1);
+
+   const struct isl_extent3d image_align_sa =
+      isl_surf_get_image_alignment_sa(surf);
+
+   const uint32_t W0 = surf->phys_level0_sa.width;
+   const uint32_t H0 = surf->phys_level0_sa.height;
+   const uint32_t D0 = surf->phys_level0_sa.depth;
+
+   uint32_t x = 0;
+   uint32_t y = 0;
+
+   /* Skip over all levels above the requested one: level l occupies
+    * ceil(level_d / 2^l) rows of height level_h.
+    */
+   for (uint32_t l = 0; l < level; ++l) {
+      const uint32_t level_h = isl_align_npot(isl_minify(H0, l), image_align_sa.h);
+      const uint32_t level_d = isl_align_npot(isl_minify(D0, l), image_align_sa.d);
+      const uint32_t max_layers_vert = isl_align(level_d, 1u << l) / (1u << l);
+
+      y += level_h * max_layers_vert;
+   }
+
+   const uint32_t level_w = isl_align_npot(isl_minify(W0, level), image_align_sa.w);
+   const uint32_t level_h = isl_align_npot(isl_minify(H0, level), image_align_sa.h);
+   const uint32_t level_d = isl_align_npot(isl_minify(D0, level), image_align_sa.d);
+
+   /* Within the requested level, each row holds up to 2^level slices. */
+   const uint32_t max_layers_horiz = MIN(level_d, 1u << level);
+
+   /* BUGFIX: a slice's row index is z / (slices per row), i.e.
+    * z / max_layers_horiz. The previous code divided by the number of rows
+    * (max_layers_vert), which selects the wrong row whenever the slice
+    * grid is not square — e.g. level 1 of a 4-deep surface has a 2-wide,
+    * 1-tall grid, and slice 1 was placed a full row too low.
+    */
+   x += level_w * (logical_z_offset_px % max_layers_horiz);
+   y += level_h * (logical_z_offset_px / max_layers_horiz);
+
+   *x_offset_sa = x;
+   *y_offset_sa = y;
+}
+
+/**
+ * A variant of isl_surf_get_image_offset_sa() specific to
+ * ISL_DIM_LAYOUT_GEN9_1D.
+ *
+ * Everything advances along x only: array slices are spaced by the array
+ * pitch, and each miplevel follows the aligned width of the previous one.
+ * The y offset is always 0.
+ */
+static void
+get_image_offset_sa_gen9_1d(const struct isl_surf *surf,
+                            uint32_t level, uint32_t layer,
+                            uint32_t *x_offset_sa,
+                            uint32_t *y_offset_sa)
+{
+   assert(level < surf->levels);
+   assert(layer < surf->phys_level0_sa.array_len);
+   assert(surf->phys_level0_sa.height == 1);
+   assert(surf->phys_level0_sa.depth == 1);
+   assert(surf->samples == 1);
+
+   const struct isl_extent3d image_align_sa =
+      isl_surf_get_image_alignment_sa(surf);
+   const uint32_t W0 = surf->phys_level0_sa.width;
+
+   /* Start of this array slice... */
+   uint32_t x_sa = layer * isl_surf_get_array_pitch_sa_rows(surf);
+
+   /* ...then step over the aligned width of every preceding miplevel. */
+   for (uint32_t l = 0; l < level; ++l)
+      x_sa += isl_align_npot(isl_minify(W0, l), image_align_sa.w);
+
+   *x_offset_sa = x_sa;
+   *y_offset_sa = 0;
+}
+
+/**
+ * Get the offset of the given subimage within its surface, in units of
+ * surface samples (sa), dispatching on the surface's dim_layout.
+ *
+ * @param level                 must be < surf->levels
+ * @param logical_array_layer   must be < the logical level-0 array length
+ * @param logical_z_offset_px   must be < the depth of @level
+ */
+void
+isl_surf_get_image_offset_sa(const struct isl_surf *surf,
+                             uint32_t level,
+                             uint32_t logical_array_layer,
+                             uint32_t logical_z_offset_px,
+                             uint32_t *x_offset_sa,
+                             uint32_t *y_offset_sa)
+{
+   assert(level < surf->levels);
+   assert(logical_array_layer < surf->logical_level0_px.array_len);
+   assert(logical_z_offset_px
+          < isl_minify(surf->logical_level0_px.depth, level));
+
+   switch (surf->dim_layout) {
+   case ISL_DIM_LAYOUT_GEN9_1D:
+      get_image_offset_sa_gen9_1d(surf, level, logical_array_layer,
+                                  x_offset_sa, y_offset_sa);
+      break;
+   case ISL_DIM_LAYOUT_GEN4_2D:
+      get_image_offset_sa_gen4_2d(surf, level, logical_array_layer,
+                                  x_offset_sa, y_offset_sa);
+      break;
+   case ISL_DIM_LAYOUT_GEN4_3D:
+      get_image_offset_sa_gen4_3d(surf, level, logical_z_offset_px,
+                                  x_offset_sa, y_offset_sa);
+      break;
+   default:
+      /* Matches the other dim_layout switches in this file; without it,
+       * *x_offset_sa and *y_offset_sa would be left uninitialized on a bad
+       * enum value.
+       */
+      unreachable("bad isl_dim_layout");
+   }
+}
diff --git a/src/isl/isl.h b/src/isl/isl.h
new file mode 100644 (file)
index 0000000..4b3f179
--- /dev/null
@@ -0,0 +1,969 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file
+ * @brief Intel Surface Layout
+ *
+ * Header Layout
+ * -------------
+ * The header is ordered as:
+ *    - forward declarations
+ *    - macros that may be overridden at compile-time for specific gens
+ *    - enums and constants
+ *    - structs and unions
+ *    - functions
+ */
+
+#pragma once
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "util/macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct brw_device_info;
+struct brw_image_param;
+
+#ifndef ISL_DEV_GEN
+/**
+ * @brief Get the hardware generation of isl_device.
+ *
+ * You can define this as a compile-time constant in the CFLAGS. For example,
+ * `gcc -DISL_DEV_GEN(dev)=9 ...`.
+ */
+#define ISL_DEV_GEN(__dev) ((__dev)->info->gen)
+#endif
+
+#ifndef ISL_DEV_USE_SEPARATE_STENCIL
+/**
+ * @brief Query whether the isl_device uses separate stencil.
+ *
+ * You can define this as a compile-time constant in the CFLAGS. For example,
+ * `gcc -DISL_DEV_USE_SEPARATE_STENCIL(dev)=1 ...`.
+ */
+#define ISL_DEV_USE_SEPARATE_STENCIL(__dev) ((__dev)->use_separate_stencil)
+#endif
+
+/**
+ * Hardware enumeration SURFACE_FORMAT.
+ *
+ * For the official list, see Broadwell PRM: Volume 2b: Command Reference:
+ * Enumerations: SURFACE_FORMAT.
+ */
+enum isl_format {
+   ISL_FORMAT_R32G32B32A32_FLOAT =                               0,
+   ISL_FORMAT_R32G32B32A32_SINT =                                1,
+   ISL_FORMAT_R32G32B32A32_UINT =                                2,
+   ISL_FORMAT_R32G32B32A32_UNORM =                               3,
+   ISL_FORMAT_R32G32B32A32_SNORM =                               4,
+   ISL_FORMAT_R64G64_FLOAT =                                     5,
+   ISL_FORMAT_R32G32B32X32_FLOAT =                               6,
+   ISL_FORMAT_R32G32B32A32_SSCALED =                             7,
+   ISL_FORMAT_R32G32B32A32_USCALED =                             8,
+   ISL_FORMAT_R32G32B32A32_SFIXED =                             32,
+   ISL_FORMAT_R64G64_PASSTHRU =                                 33,
+   ISL_FORMAT_R32G32B32_FLOAT =                                 64,
+   ISL_FORMAT_R32G32B32_SINT =                                  65,
+   ISL_FORMAT_R32G32B32_UINT =                                  66,
+   ISL_FORMAT_R32G32B32_UNORM =                                 67,
+   ISL_FORMAT_R32G32B32_SNORM =                                 68,
+   ISL_FORMAT_R32G32B32_SSCALED =                               69,
+   ISL_FORMAT_R32G32B32_USCALED =                               70,
+   ISL_FORMAT_R32G32B32_SFIXED =                                80,
+   ISL_FORMAT_R16G16B16A16_UNORM =                             128,
+   ISL_FORMAT_R16G16B16A16_SNORM =                             129,
+   ISL_FORMAT_R16G16B16A16_SINT =                              130,
+   ISL_FORMAT_R16G16B16A16_UINT =                              131,
+   ISL_FORMAT_R16G16B16A16_FLOAT =                             132,
+   ISL_FORMAT_R32G32_FLOAT =                                   133,
+   ISL_FORMAT_R32G32_SINT =                                    134,
+   ISL_FORMAT_R32G32_UINT =                                    135,
+   ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS =                       136,
+   ISL_FORMAT_X32_TYPELESS_G8X24_UINT =                        137,
+   ISL_FORMAT_L32A32_FLOAT =                                   138,
+   ISL_FORMAT_R32G32_UNORM =                                   139,
+   ISL_FORMAT_R32G32_SNORM =                                   140,
+   ISL_FORMAT_R64_FLOAT =                                      141,
+   ISL_FORMAT_R16G16B16X16_UNORM =                             142,
+   ISL_FORMAT_R16G16B16X16_FLOAT =                             143,
+   ISL_FORMAT_A32X32_FLOAT =                                   144,
+   ISL_FORMAT_L32X32_FLOAT =                                   145,
+   ISL_FORMAT_I32X32_FLOAT =                                   146,
+   ISL_FORMAT_R16G16B16A16_SSCALED =                           147,
+   ISL_FORMAT_R16G16B16A16_USCALED =                           148,
+   ISL_FORMAT_R32G32_SSCALED =                                 149,
+   ISL_FORMAT_R32G32_USCALED =                                 150,
+   ISL_FORMAT_R32G32_SFIXED =                                  160,
+   ISL_FORMAT_R64_PASSTHRU =                                   161,
+   ISL_FORMAT_B8G8R8A8_UNORM =                                 192,
+   ISL_FORMAT_B8G8R8A8_UNORM_SRGB =                            193,
+   ISL_FORMAT_R10G10B10A2_UNORM =                              194,
+   ISL_FORMAT_R10G10B10A2_UNORM_SRGB =                         195,
+   ISL_FORMAT_R10G10B10A2_UINT =                               196,
+   ISL_FORMAT_R10G10B10_SNORM_A2_UNORM =                       197,
+   ISL_FORMAT_R8G8B8A8_UNORM =                                 199,
+   ISL_FORMAT_R8G8B8A8_UNORM_SRGB =                            200,
+   ISL_FORMAT_R8G8B8A8_SNORM =                                 201,
+   ISL_FORMAT_R8G8B8A8_SINT =                                  202,
+   ISL_FORMAT_R8G8B8A8_UINT =                                  203,
+   ISL_FORMAT_R16G16_UNORM =                                   204,
+   ISL_FORMAT_R16G16_SNORM =                                   205,
+   ISL_FORMAT_R16G16_SINT =                                    206,
+   ISL_FORMAT_R16G16_UINT =                                    207,
+   ISL_FORMAT_R16G16_FLOAT =                                   208,
+   ISL_FORMAT_B10G10R10A2_UNORM =                              209,
+   ISL_FORMAT_B10G10R10A2_UNORM_SRGB =                         210,
+   ISL_FORMAT_R11G11B10_FLOAT =                                211,
+   ISL_FORMAT_R32_SINT =                                       214,
+   ISL_FORMAT_R32_UINT =                                       215,
+   ISL_FORMAT_R32_FLOAT =                                      216,
+   ISL_FORMAT_R24_UNORM_X8_TYPELESS =                          217,
+   ISL_FORMAT_X24_TYPELESS_G8_UINT =                           218,
+   ISL_FORMAT_L32_UNORM =                                      221,
+   ISL_FORMAT_A32_UNORM =                                      222,
+   ISL_FORMAT_L16A16_UNORM =                                   223,
+   ISL_FORMAT_I24X8_UNORM =                                    224,
+   ISL_FORMAT_L24X8_UNORM =                                    225,
+   ISL_FORMAT_A24X8_UNORM =                                    226,
+   ISL_FORMAT_I32_FLOAT =                                      227,
+   ISL_FORMAT_L32_FLOAT =                                      228,
+   ISL_FORMAT_A32_FLOAT =                                      229,
+   ISL_FORMAT_X8B8_UNORM_G8R8_SNORM =                          230,
+   ISL_FORMAT_A8X8_UNORM_G8R8_SNORM =                          231,
+   ISL_FORMAT_B8X8_UNORM_G8R8_SNORM =                          232,
+   ISL_FORMAT_B8G8R8X8_UNORM =                                 233,
+   ISL_FORMAT_B8G8R8X8_UNORM_SRGB =                            234,
+   ISL_FORMAT_R8G8B8X8_UNORM =                                 235,
+   ISL_FORMAT_R8G8B8X8_UNORM_SRGB =                            236,
+   ISL_FORMAT_R9G9B9E5_SHAREDEXP =                             237,
+   ISL_FORMAT_B10G10R10X2_UNORM =                              238,
+   ISL_FORMAT_L16A16_FLOAT =                                   240,
+   ISL_FORMAT_R32_UNORM =                                      241,
+   ISL_FORMAT_R32_SNORM =                                      242,
+   ISL_FORMAT_R10G10B10X2_USCALED =                            243,
+   ISL_FORMAT_R8G8B8A8_SSCALED =                               244,
+   ISL_FORMAT_R8G8B8A8_USCALED =                               245,
+   ISL_FORMAT_R16G16_SSCALED =                                 246,
+   ISL_FORMAT_R16G16_USCALED =                                 247,
+   ISL_FORMAT_R32_SSCALED =                                    248,
+   ISL_FORMAT_R32_USCALED =                                    249,
+   ISL_FORMAT_B5G6R5_UNORM =                                   256,
+   ISL_FORMAT_B5G6R5_UNORM_SRGB =                              257,
+   ISL_FORMAT_B5G5R5A1_UNORM =                                 258,
+   ISL_FORMAT_B5G5R5A1_UNORM_SRGB =                            259,
+   ISL_FORMAT_B4G4R4A4_UNORM =                                 260,
+   ISL_FORMAT_B4G4R4A4_UNORM_SRGB =                            261,
+   ISL_FORMAT_R8G8_UNORM =                                     262,
+   ISL_FORMAT_R8G8_SNORM =                                     263,
+   ISL_FORMAT_R8G8_SINT =                                      264,
+   ISL_FORMAT_R8G8_UINT =                                      265,
+   ISL_FORMAT_R16_UNORM =                                      266,
+   ISL_FORMAT_R16_SNORM =                                      267,
+   ISL_FORMAT_R16_SINT =                                       268,
+   ISL_FORMAT_R16_UINT =                                       269,
+   ISL_FORMAT_R16_FLOAT =                                      270,
+   ISL_FORMAT_A8P8_UNORM_PALETTE0 =                            271,
+   ISL_FORMAT_A8P8_UNORM_PALETTE1 =                            272,
+   ISL_FORMAT_I16_UNORM =                                      273,
+   ISL_FORMAT_L16_UNORM =                                      274,
+   ISL_FORMAT_A16_UNORM =                                      275,
+   ISL_FORMAT_L8A8_UNORM =                                     276,
+   ISL_FORMAT_I16_FLOAT =                                      277,
+   ISL_FORMAT_L16_FLOAT =                                      278,
+   ISL_FORMAT_A16_FLOAT =                                      279,
+   ISL_FORMAT_L8A8_UNORM_SRGB =                                280,
+   ISL_FORMAT_R5G5_SNORM_B6_UNORM =                            281,
+   ISL_FORMAT_B5G5R5X1_UNORM =                                 282,
+   ISL_FORMAT_B5G5R5X1_UNORM_SRGB =                            283,
+   ISL_FORMAT_R8G8_SSCALED =                                   284,
+   ISL_FORMAT_R8G8_USCALED =                                   285,
+   ISL_FORMAT_R16_SSCALED =                                    286,
+   ISL_FORMAT_R16_USCALED =                                    287,
+   ISL_FORMAT_P8A8_UNORM_PALETTE0 =                            290,
+   ISL_FORMAT_P8A8_UNORM_PALETTE1 =                            291,
+   ISL_FORMAT_A1B5G5R5_UNORM =                                 292,
+   ISL_FORMAT_A4B4G4R4_UNORM =                                 293,
+   ISL_FORMAT_L8A8_UINT =                                      294,
+   ISL_FORMAT_L8A8_SINT =                                      295,
+   ISL_FORMAT_R8_UNORM =                                       320,
+   ISL_FORMAT_R8_SNORM =                                       321,
+   ISL_FORMAT_R8_SINT =                                        322,
+   ISL_FORMAT_R8_UINT =                                        323,
+   ISL_FORMAT_A8_UNORM =                                       324,
+   ISL_FORMAT_I8_UNORM =                                       325,
+   ISL_FORMAT_L8_UNORM =                                       326,
+   ISL_FORMAT_P4A4_UNORM_PALETTE0 =                            327,
+   ISL_FORMAT_A4P4_UNORM_PALETTE0 =                            328,
+   ISL_FORMAT_R8_SSCALED =                                     329,
+   ISL_FORMAT_R8_USCALED =                                     330,
+   ISL_FORMAT_P8_UNORM_PALETTE0 =                              331,
+   ISL_FORMAT_L8_UNORM_SRGB =                                  332,
+   ISL_FORMAT_P8_UNORM_PALETTE1 =                              333,
+   ISL_FORMAT_P4A4_UNORM_PALETTE1 =                            334,
+   ISL_FORMAT_A4P4_UNORM_PALETTE1 =                            335,
+   ISL_FORMAT_Y8_UNORM =                                       336,
+   ISL_FORMAT_L8_UINT =                                        338,
+   ISL_FORMAT_L8_SINT =                                        339,
+   ISL_FORMAT_I8_UINT =                                        340,
+   ISL_FORMAT_I8_SINT =                                        341,
+   ISL_FORMAT_DXT1_RGB_SRGB =                                  384,
+   ISL_FORMAT_R1_UNORM =                                       385,
+   ISL_FORMAT_YCRCB_NORMAL =                                   386,
+   ISL_FORMAT_YCRCB_SWAPUVY =                                  387,
+   ISL_FORMAT_P2_UNORM_PALETTE0 =                              388,
+   ISL_FORMAT_P2_UNORM_PALETTE1 =                              389,
+   ISL_FORMAT_BC1_UNORM =                                      390,
+   ISL_FORMAT_BC2_UNORM =                                      391,
+   ISL_FORMAT_BC3_UNORM =                                      392,
+   ISL_FORMAT_BC4_UNORM =                                      393,
+   ISL_FORMAT_BC5_UNORM =                                      394,
+   ISL_FORMAT_BC1_UNORM_SRGB =                                 395,
+   ISL_FORMAT_BC2_UNORM_SRGB =                                 396,
+   ISL_FORMAT_BC3_UNORM_SRGB =                                 397,
+   ISL_FORMAT_MONO8 =                                          398,
+   ISL_FORMAT_YCRCB_SWAPUV =                                   399,
+   ISL_FORMAT_YCRCB_SWAPY =                                    400,
+   ISL_FORMAT_DXT1_RGB =                                       401,
+   ISL_FORMAT_FXT1 =                                           402,
+   ISL_FORMAT_R8G8B8_UNORM =                                   403,
+   ISL_FORMAT_R8G8B8_SNORM =                                   404,
+   ISL_FORMAT_R8G8B8_SSCALED =                                 405,
+   ISL_FORMAT_R8G8B8_USCALED =                                 406,
+   ISL_FORMAT_R64G64B64A64_FLOAT =                             407,
+   ISL_FORMAT_R64G64B64_FLOAT =                                408,
+   ISL_FORMAT_BC4_SNORM =                                      409,
+   ISL_FORMAT_BC5_SNORM =                                      410,
+   ISL_FORMAT_R16G16B16_FLOAT =                                411,
+   ISL_FORMAT_R16G16B16_UNORM =                                412,
+   ISL_FORMAT_R16G16B16_SNORM =                                413,
+   ISL_FORMAT_R16G16B16_SSCALED =                              414,
+   ISL_FORMAT_R16G16B16_USCALED =                              415,
+   ISL_FORMAT_BC6H_SF16 =                                      417,
+   ISL_FORMAT_BC7_UNORM =                                      418,
+   ISL_FORMAT_BC7_UNORM_SRGB =                                 419,
+   ISL_FORMAT_BC6H_UF16 =                                      420,
+   ISL_FORMAT_PLANAR_420_8 =                                   421,
+   ISL_FORMAT_R8G8B8_UNORM_SRGB =                              424,
+   ISL_FORMAT_ETC1_RGB8 =                                      425,
+   ISL_FORMAT_ETC2_RGB8 =                                      426,
+   ISL_FORMAT_EAC_R11 =                                        427,
+   ISL_FORMAT_EAC_RG11 =                                       428,
+   ISL_FORMAT_EAC_SIGNED_R11 =                                 429,
+   ISL_FORMAT_EAC_SIGNED_RG11 =                                430,
+   ISL_FORMAT_ETC2_SRGB8 =                                     431,
+   ISL_FORMAT_R16G16B16_UINT =                                 432,
+   ISL_FORMAT_R16G16B16_SINT =                                 433,
+   ISL_FORMAT_R32_SFIXED =                                     434,
+   ISL_FORMAT_R10G10B10A2_SNORM =                              435,
+   ISL_FORMAT_R10G10B10A2_USCALED =                            436,
+   ISL_FORMAT_R10G10B10A2_SSCALED =                            437,
+   ISL_FORMAT_R10G10B10A2_SINT =                               438,
+   ISL_FORMAT_B10G10R10A2_SNORM =                              439,
+   ISL_FORMAT_B10G10R10A2_USCALED =                            440,
+   ISL_FORMAT_B10G10R10A2_SSCALED =                            441,
+   ISL_FORMAT_B10G10R10A2_UINT =                               442,
+   ISL_FORMAT_B10G10R10A2_SINT =                               443,
+   ISL_FORMAT_R64G64B64A64_PASSTHRU =                          444,
+   ISL_FORMAT_R64G64B64_PASSTHRU =                             445,
+   ISL_FORMAT_ETC2_RGB8_PTA =                                  448,
+   ISL_FORMAT_ETC2_SRGB8_PTA =                                 449,
+   ISL_FORMAT_ETC2_EAC_RGBA8 =                                 450,
+   ISL_FORMAT_ETC2_EAC_SRGB8_A8 =                              451,
+   ISL_FORMAT_R8G8B8_UINT =                                    456,
+   ISL_FORMAT_R8G8B8_SINT =                                    457,
+   ISL_FORMAT_RAW =                                            511,
+
+   /* Hardware doesn't understand this out-of-band value */
+   ISL_FORMAT_UNSUPPORTED =                             UINT16_MAX,
+};
+
+/**
+ * Numerical base type for channels of isl_format.
+ */
+enum isl_base_type {
+   ISL_VOID,
+   ISL_RAW,
+   ISL_UNORM,
+   ISL_SNORM,
+   ISL_UFLOAT,
+   ISL_SFLOAT,
+   ISL_UFIXED,
+   ISL_SFIXED,
+   ISL_UINT,
+   ISL_SINT,
+   ISL_USCALED,
+   ISL_SSCALED,
+};
+
+/**
+ * Colorspace of isl_format.
+ */
+enum isl_colorspace {
+   ISL_COLORSPACE_NONE = 0,
+   ISL_COLORSPACE_LINEAR,
+   ISL_COLORSPACE_SRGB,
+   ISL_COLORSPACE_YUV,
+};
+
+/**
+ * Texture compression mode of isl_format.
+ */
+enum isl_txc {
+   ISL_TXC_NONE = 0,
+   ISL_TXC_DXT1,
+   ISL_TXC_DXT3,
+   ISL_TXC_DXT5,
+   ISL_TXC_FXT1,
+   ISL_TXC_RGTC1,
+   ISL_TXC_RGTC2,
+   ISL_TXC_BPTC,
+   ISL_TXC_ETC1,
+   ISL_TXC_ETC2,
+};
+
+/**
+ * @brief Hardware tile mode
+ *
+ * WARNING: These values differ from the hardware enum values, which are
+ * unstable across hardware generations.
+ *
+ * Note that legacy Y tiling is ISL_TILING_Y0 instead of ISL_TILING_Y, to
+ * clearly distinguish it from Yf and Ys.
+ */
+enum isl_tiling {
+   ISL_TILING_LINEAR = 0,
+   ISL_TILING_W,
+   ISL_TILING_X,
+   ISL_TILING_Y0, /**< Legacy Y tiling */
+   ISL_TILING_Yf, /**< Standard 4K tiling. The 'f' means "four". */
+   ISL_TILING_Ys, /**< Standard 64K tiling. The 's' means "sixty-four". */
+};
+
+/**
+ * @defgroup Tiling Flags
+ * @{
+ */
+typedef uint32_t isl_tiling_flags_t;
+#define ISL_TILING_LINEAR_BIT             (1u << ISL_TILING_LINEAR)
+#define ISL_TILING_W_BIT                  (1u << ISL_TILING_W)
+#define ISL_TILING_X_BIT                  (1u << ISL_TILING_X)
+#define ISL_TILING_Y0_BIT                 (1u << ISL_TILING_Y0)
+#define ISL_TILING_Yf_BIT                 (1u << ISL_TILING_Yf)
+#define ISL_TILING_Ys_BIT                 (1u << ISL_TILING_Ys)
+#define ISL_TILING_ANY_MASK               (~0u)
+#define ISL_TILING_NON_LINEAR_MASK        (~ISL_TILING_LINEAR_BIT)
+
+/** Any Y tiling, including legacy Y tiling. */
+#define ISL_TILING_ANY_Y_MASK             (ISL_TILING_Y0_BIT | \
+                                           ISL_TILING_Yf_BIT | \
+                                           ISL_TILING_Ys_BIT)
+
+/** The Skylake BSpec refers to Yf and Ys as "standard tiling formats". */
+#define ISL_TILING_STD_Y_MASK             (ISL_TILING_Yf_BIT | \
+                                           ISL_TILING_Ys_BIT)
+/** @} */
+
+/**
+ * @brief Logical dimension of surface.
+ *
+ * Note: There is no dimension for cube map surfaces. ISL interprets cube maps
+ * as 2D array surfaces.
+ */
+enum isl_surf_dim {
+   ISL_SURF_DIM_1D,
+   ISL_SURF_DIM_2D,
+   ISL_SURF_DIM_3D,
+};
+
+/**
+ * @brief Physical layout of the surface's dimensions.
+ */
+enum isl_dim_layout {
+   /**
+    * For details, see the G35 PRM >> Volume 1: Graphics Core >> Section
+    * 6.17.3: 2D Surfaces.
+    *
+    * On many gens, 1D surfaces share the same layout as 2D surfaces.  From
+    * the G35 PRM >> Volume 1: Graphics Core >> Section 6.17.2: 1D Surfaces:
+    *
+    *    One-dimensional surfaces are identical to 2D surfaces with height of
+    *    one.
+    *
+    * @invariant isl_surf::phys_level0_sa::depth == 1
+    */
+   ISL_DIM_LAYOUT_GEN4_2D,
+
+   /**
+    * For details, see the G35 PRM >> Volume 1: Graphics Core >> Section
+    * 6.17.5: 3D Surfaces.
+    *
+    * @invariant isl_surf::phys_level0_sa::array_len == 1
+    */
+   ISL_DIM_LAYOUT_GEN4_3D,
+
+   /**
+    * For details, see the Skylake BSpec >> Memory Views >> Common Surface
+    * Formats >> Surface Layout and Tiling >> 1D Surfaces.
+    */
+   ISL_DIM_LAYOUT_GEN9_1D,
+};
+
+/* TODO(chadv): Explain */
+enum isl_array_pitch_span {
+   ISL_ARRAY_PITCH_SPAN_FULL,
+   ISL_ARRAY_PITCH_SPAN_COMPACT,
+};
+
+/**
+ * @defgroup Surface Usage
+ * @{
+ */
+typedef uint64_t isl_surf_usage_flags_t;
+#define ISL_SURF_USAGE_RENDER_TARGET_BIT       (1u << 0)
+#define ISL_SURF_USAGE_DEPTH_BIT               (1u << 1)
+#define ISL_SURF_USAGE_STENCIL_BIT             (1u << 2)
+#define ISL_SURF_USAGE_TEXTURE_BIT             (1u << 3)
+#define ISL_SURF_USAGE_CUBE_BIT                (1u << 4)
+#define ISL_SURF_USAGE_DISABLE_AUX_BIT         (1u << 5)
+#define ISL_SURF_USAGE_DISPLAY_BIT             (1u << 6)
+#define ISL_SURF_USAGE_DISPLAY_ROTATE_90_BIT   (1u << 7)
+#define ISL_SURF_USAGE_DISPLAY_ROTATE_180_BIT  (1u << 8)
+#define ISL_SURF_USAGE_DISPLAY_ROTATE_270_BIT  (1u << 9)
+#define ISL_SURF_USAGE_DISPLAY_FLIP_X_BIT      (1u << 10)
+#define ISL_SURF_USAGE_DISPLAY_FLIP_Y_BIT      (1u << 11)
+/** @} */
+
+/**
+ * @brief Multisample Format
+ */
+enum isl_msaa_layout {
+   /**
+    * @brief Surface is single-sampled.
+    */
+   ISL_MSAA_LAYOUT_NONE,
+
+   /**
+    * @brief [SNB+] Interleaved Multisample Format
+    *
+    * In this format, multiple samples are interleaved into each cacheline.
+    * In other words, the sample index is swizzled into the low 6 bits of the
+    * surface's virtual address space.
+    *
+    * For example, suppose the surface is legacy Y tiled, is 4x multisampled,
+    * and its pixel format is 32bpp. Then the first cacheline is arranged
+    * thus:
+    *
+    *    (0,0,0) (0,1,0)   (0,0,1) (1,0,1)
+    *    (1,0,0) (1,1,0)   (0,1,1) (1,1,1)
+    *
+    *    (0,0,2) (1,0,2)   (0,0,3) (1,0,3)
+    *    (0,1,2) (1,1,2)   (0,1,3) (1,1,3)
+    *
+    * The hardware docs refer to this format with multiple terms.  In
+    * Sandybridge, this is the only multisample format; so no term is used.
+    * The Ivybridge docs refer to surfaces in this format as IMS (Interleaved
+    * Multisample Surface). Later hardware docs additionally refer to this
+    * format as MSFMT_DEPTH_STENCIL (because the format is deprecated for
+    * color surfaces).
+    *
+    * See the Sandybridge PRM, Volume 4, Part 1, Section 2.7 "Multisampled
+    * Surface Behavior".
+    *
+    * See the Ivybridge PRM, Volume 1, Part 1, Section 6.18.4.1 "Interleaved
+    * Multisampled Surfaces".
+    */
+   ISL_MSAA_LAYOUT_INTERLEAVED,
+
+   /**
+    * @brief [IVB+] Array Multisample Format
+    *
+    * In this format, the surface's physical layout resembles that of a
+    * 2D array surface.
+    *
+    * Suppose the multisample surface's logical extent is (w, h) and its
+    * sample count is N. Then surface's physical extent is the same as
+    * a singlesample 2D surface whose logical extent is (w, h) and array
+    * length is N.  Array slice `i` contains the pixel values for sample
+    * index `i`.
+    *
+    * The Ivybridge docs refer to surfaces in this format as UMS
+    * (Uncompressed Multisample Layout) and CMS (Compressed Multisample
+    * Surface). The Broadwell docs additionally refer to this format as
+    * MSFMT_MSS (MSS=Multisample Surface Storage).
+    *
+    * See the Broadwell PRM, Volume 5 "Memory Views", Section "Uncompressed
+    * Multisample Surfaces".
+    *
+    * See the Broadwell PRM, Volume 5 "Memory Views", Section "Compressed
+    * Multisample Surfaces".
+    */
+   ISL_MSAA_LAYOUT_ARRAY,
+};
+
+
+struct isl_device {
+   const struct brw_device_info *info;
+   bool use_separate_stencil;
+   bool has_bit6_swizzling;
+};
+
+struct isl_extent2d {
+   union { uint32_t w, width; };
+   union { uint32_t h, height; };
+};
+
+struct isl_extent3d {
+   union { uint32_t w, width; };
+   union { uint32_t h, height; };
+   union { uint32_t d, depth; };
+};
+
+struct isl_extent4d {
+   union { uint32_t w, width; };
+   union { uint32_t h, height; };
+   union { uint32_t d, depth; };
+   union { uint32_t a, array_len; };
+};
+
+struct isl_channel_layout {
+   enum isl_base_type type;
+   uint8_t bits; /**< Size in bits */
+};
+
+/**
+ *   Each format has 3D block extent (width, height, depth). The block extent
+ *   of compressed formats is that of the format's compression block. For
+ *   example, the block extent of ISL_FORMAT_ETC2_RGB8 is (w=4, h=4, d=1).
+ *   The block extent of uncompressed pixel formats, such as
+   *   ISL_FORMAT_R8G8B8A8_UNORM, is (w=1, h=1, d=1).
+ */
+struct isl_format_layout {
+   enum isl_format format;
+
+   uint8_t bs; /**< Block size, in bytes, rounded towards 0 */
+   uint8_t bw; /**< Block width, in pixels */
+   uint8_t bh; /**< Block height, in pixels */
+   uint8_t bd; /**< Block depth, in pixels */
+
+   struct {
+      struct isl_channel_layout r; /**< Red channel */
+      struct isl_channel_layout g; /**< Green channel */
+      struct isl_channel_layout b; /**< Blue channel */
+      struct isl_channel_layout a; /**< Alpha channel */
+      struct isl_channel_layout l; /**< Luminance channel */
+      struct isl_channel_layout i; /**< Intensity channel */
+      struct isl_channel_layout p; /**< Palette channel */
+   } channels;
+
+   enum isl_colorspace colorspace;
+   enum isl_txc txc;
+};
+
+struct isl_tile_info {
+   enum isl_tiling tiling;
+   uint32_t width; /**< in bytes */
+   uint32_t height; /**< in rows of memory */
+   uint32_t size; /**< in bytes */
+};
+
+/**
+ * @brief Input to surface initialization
+ *
+ * @invariant width >= 1
+ * @invariant height >= 1
+ * @invariant depth >= 1
+ * @invariant levels >= 1
+ * @invariant samples >= 1
+ * @invariant array_len >= 1
+ *
+ * @invariant if 1D then height == 1 and depth == 1 and samples == 1
+ * @invariant if 2D then depth == 1
+ * @invariant if 3D then array_len == 1 and samples == 1
+ */
+struct isl_surf_init_info {
+   enum isl_surf_dim dim;
+   enum isl_format format;
+
+   uint32_t width;
+   uint32_t height;
+   uint32_t depth;
+   uint32_t levels;
+   uint32_t array_len;
+   uint32_t samples;
+
+   /** Lower bound for isl_surf::alignment, in bytes. */
+   uint32_t min_alignment;
+
+   /** Lower bound for isl_surf::pitch, in bytes. */
+   uint32_t min_pitch;
+
+   isl_surf_usage_flags_t usage;
+
+   /** Flags that alter how ISL selects isl_surf::tiling.  */
+   isl_tiling_flags_t tiling_flags;
+};
+
+struct isl_surf {
+   enum isl_surf_dim dim;
+   enum isl_dim_layout dim_layout;
+   enum isl_msaa_layout msaa_layout;
+   enum isl_tiling tiling;
+   enum isl_format format;
+
+   /**
+    * Alignment of the upper-left sample of each subimage, in units of surface
+    * elements.
+    */
+   struct isl_extent3d image_alignment_el;
+
+   /**
+    * Logical extent of the surface's base level, in units of pixels.  This is
+    * identical to the extent defined in isl_surf_init_info.
+    */
+   struct isl_extent4d logical_level0_px;
+
+   /**
+    * Physical extent of the surface's base level, in units of physical
+    * surface samples and aligned to the format's compression block.
+    *
+    * Consider isl_dim_layout as an operator that transforms a logical surface
+    * layout to a physical surface layout. Then
+    *
+    *    logical_layout := (isl_surf::dim, isl_surf::logical_level0_px)
+    *    isl_surf::phys_level0_sa := isl_surf::dim_layout * logical_layout
+    */
+   struct isl_extent4d phys_level0_sa;
+
+   uint32_t levels;
+   uint32_t samples;
+
+   /** Total size of the surface, in bytes. */
+   uint32_t size;
+
+   /** Required alignment for the surface's base address. */
+   uint32_t alignment;
+
+   /**
+    * Pitch between vertically adjacent surface elements, in bytes.
+    */
+   uint32_t row_pitch;
+
+   /**
+    * Pitch between physical array slices, in rows of surface elements.
+    */
+   uint32_t array_pitch_el_rows;
+
+   enum isl_array_pitch_span array_pitch_span;
+
+   /** Copy of isl_surf_init_info::usage. */
+   isl_surf_usage_flags_t usage;
+};
+
+extern const struct isl_format_layout isl_format_layouts[];
+
+void
+isl_device_init(struct isl_device *dev,
+                const struct brw_device_info *info,
+                bool has_bit6_swizzling);
+
+static inline const struct isl_format_layout * ATTRIBUTE_CONST
+isl_format_get_layout(enum isl_format fmt)
+{
+   return &isl_format_layouts[fmt];
+}
+
+bool
+isl_format_has_sint_channel(enum isl_format fmt) ATTRIBUTE_CONST;
+
+static inline bool
+isl_format_is_compressed(enum isl_format fmt)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return fmtl->txc != ISL_TXC_NONE;
+}
+
+static inline bool
+isl_format_has_bc_compression(enum isl_format fmt)
+{
+   switch (isl_format_get_layout(fmt)->txc) {
+   case ISL_TXC_DXT1:
+   case ISL_TXC_DXT3:
+   case ISL_TXC_DXT5:
+      return true;
+   case ISL_TXC_NONE:
+   case ISL_TXC_FXT1:
+   case ISL_TXC_RGTC1:
+   case ISL_TXC_RGTC2:
+   case ISL_TXC_BPTC:
+   case ISL_TXC_ETC1:
+   case ISL_TXC_ETC2:
+      return false;
+   }
+
+   unreachable("bad texture compression mode");
+   return false;
+}
+
+static inline bool
+isl_format_is_yuv(enum isl_format fmt)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return fmtl->colorspace == ISL_COLORSPACE_YUV;
+}
+
+static inline bool
+isl_format_block_is_1x1x1(enum isl_format fmt)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return fmtl->bw == 1 && fmtl->bh == 1 && fmtl->bd == 1;
+}
+
+static inline bool
+isl_format_is_rgb(enum isl_format fmt)
+{
+   return isl_format_layouts[fmt].channels.r.bits > 0 &&
+          isl_format_layouts[fmt].channels.g.bits > 0 &&
+          isl_format_layouts[fmt].channels.b.bits > 0 &&
+          isl_format_layouts[fmt].channels.a.bits == 0;
+}
+
+enum isl_format isl_format_rgb_to_rgba(enum isl_format rgb) ATTRIBUTE_CONST;
+enum isl_format isl_format_rgb_to_rgbx(enum isl_format rgb) ATTRIBUTE_CONST;
+
+bool isl_is_storage_image_format(enum isl_format fmt);
+
+enum isl_format
+isl_lower_storage_image_format(const struct isl_device *dev,
+                               enum isl_format fmt);
+
+static inline bool
+isl_tiling_is_std_y(enum isl_tiling tiling)
+{
+   return (1u << tiling) & ISL_TILING_STD_Y_MASK;
+}
+
+bool
+isl_tiling_get_info(const struct isl_device *dev,
+                    enum isl_tiling tiling,
+                    uint32_t format_block_size,
+                    struct isl_tile_info *info);
+
+void
+isl_tiling_get_extent(const struct isl_device *dev,
+                      enum isl_tiling tiling,
+                      uint32_t format_block_size,
+                      struct isl_extent2d *e);
+bool
+isl_surf_choose_tiling(const struct isl_device *dev,
+                       const struct isl_surf_init_info *restrict info,
+                       enum isl_tiling *tiling);
+
+static inline bool
+isl_surf_usage_is_display(isl_surf_usage_flags_t usage)
+{
+   return usage & ISL_SURF_USAGE_DISPLAY_BIT;
+}
+
+static inline bool
+isl_surf_usage_is_depth(isl_surf_usage_flags_t usage)
+{
+   return usage & ISL_SURF_USAGE_DEPTH_BIT;
+}
+
+static inline bool
+isl_surf_usage_is_stencil(isl_surf_usage_flags_t usage)
+{
+   return usage & ISL_SURF_USAGE_STENCIL_BIT;
+}
+
+static inline bool
+isl_surf_usage_is_depth_and_stencil(isl_surf_usage_flags_t usage)
+{
+   return (usage & ISL_SURF_USAGE_DEPTH_BIT) &&
+          (usage & ISL_SURF_USAGE_STENCIL_BIT);
+}
+
+static inline bool
+isl_surf_usage_is_depth_or_stencil(isl_surf_usage_flags_t usage)
+{
+   return usage & (ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_STENCIL_BIT);
+}
+
+static inline bool
+isl_surf_info_is_z16(const struct isl_surf_init_info *info)
+{
+   return (info->usage & ISL_SURF_USAGE_DEPTH_BIT) &&
+          (info->format == ISL_FORMAT_R16_UNORM);
+}
+
+static inline bool
+isl_surf_info_is_z32_float(const struct isl_surf_init_info *info)
+{
+   return (info->usage & ISL_SURF_USAGE_DEPTH_BIT) &&
+          (info->format == ISL_FORMAT_R32_FLOAT);
+}
+
+static inline struct isl_extent2d
+isl_extent2d(uint32_t width, uint32_t height)
+{
+   return (struct isl_extent2d) { .w = width, .h = height };
+}
+
+static inline struct isl_extent3d
+isl_extent3d(uint32_t width, uint32_t height, uint32_t depth)
+{
+   return (struct isl_extent3d) { .w = width, .h = height, .d = depth };
+}
+
+static inline struct isl_extent4d
+isl_extent4d(uint32_t width, uint32_t height, uint32_t depth,
+             uint32_t array_len)
+{
+   return (struct isl_extent4d) {
+      .w = width,
+      .h = height,
+      .d = depth,
+      .a = array_len,
+   };
+}
+
+#define isl_surf_init(dev, surf, ...) \
+   isl_surf_init_s((dev), (surf), \
+                   &(struct isl_surf_init_info) {  __VA_ARGS__ });
+
+bool
+isl_surf_init_s(const struct isl_device *dev,
+                struct isl_surf *surf,
+                const struct isl_surf_init_info *restrict info);
+
+/**
+ * Alignment of the upper-left sample of each subimage, in units of surface
+ * elements.
+ */
+static inline struct isl_extent3d
+isl_surf_get_image_alignment_el(const struct isl_surf *surf)
+{
+   return surf->image_alignment_el;
+}
+
+/**
+ * Alignment of the upper-left sample of each subimage, in units of surface
+ * samples.
+ */
+static inline struct isl_extent3d
+isl_surf_get_image_alignment_sa(const struct isl_surf *surf)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+
+   return (struct isl_extent3d) {
+      .w = fmtl->bw * surf->image_alignment_el.w,
+      .h = fmtl->bh * surf->image_alignment_el.h,
+      .d = fmtl->bd * surf->image_alignment_el.d,
+   };
+}
+
+/**
+ * Pitch between vertically adjacent surface elements, in bytes.
+ */
+static inline uint32_t
+isl_surf_get_row_pitch(const struct isl_surf *surf)
+{
+   return surf->row_pitch;
+}
+
+/**
+ * Pitch between vertically adjacent surface elements, in units of surface elements.
+ */
+static inline uint32_t
+isl_surf_get_row_pitch_el(const struct isl_surf *surf)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+
+   assert(surf->row_pitch % fmtl->bs == 0);
+   return surf->row_pitch / fmtl->bs;
+}
+
+/**
+ * Pitch between physical array slices, in rows of surface elements.
+ */
+static inline uint32_t
+isl_surf_get_array_pitch_el_rows(const struct isl_surf *surf)
+{
+   return surf->array_pitch_el_rows;
+}
+
+/**
+ * Pitch between physical array slices, in units of surface elements.
+ */
+static inline uint32_t
+isl_surf_get_array_pitch_el(const struct isl_surf *surf)
+{
+   return isl_surf_get_array_pitch_el_rows(surf) *
+          isl_surf_get_row_pitch_el(surf);
+}
+
+/**
+ * Pitch between physical array slices, in rows of surface samples.
+ */
+static inline uint32_t
+isl_surf_get_array_pitch_sa_rows(const struct isl_surf *surf)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+   return fmtl->bh * isl_surf_get_array_pitch_el_rows(surf);
+}
+
+/**
+ * Pitch between physical array slices, in bytes.
+ */
+static inline uint32_t
+isl_surf_get_array_pitch(const struct isl_surf *surf)
+{
+   return isl_surf_get_array_pitch_sa_rows(surf) * surf->row_pitch;
+}
+
+/**
+ * Get the offset to an subimage within the surface, in units of surface
+ * samples.
+ *
+ * @invariant level < surface levels
+ * @invariant logical_array_layer < logical array length of surface
+ * @invariant logical_z_offset_px < logical depth of surface at level
+ */
+void
+isl_surf_get_image_offset_sa(const struct isl_surf *surf,
+                             uint32_t level,
+                             uint32_t logical_array_layer,
+                             uint32_t logical_z_offset_px,
+                             uint32_t *x_offset_sa,
+                             uint32_t *y_offset_sa);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_format.c b/src/isl/isl_format.c
new file mode 100644 (file)
index 0000000..0fe6e9b
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include "isl.h"
+
+bool
+isl_format_has_sint_channel(enum isl_format fmt)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return fmtl->channels.r.type == ISL_SINT ||
+          fmtl->channels.g.type == ISL_SINT ||
+          fmtl->channels.b.type == ISL_SINT ||
+          fmtl->channels.a.type == ISL_SINT ||
+          fmtl->channels.l.type == ISL_SINT ||
+          fmtl->channels.i.type == ISL_SINT ||
+          fmtl->channels.p.type == ISL_SINT;
+}
+
+enum isl_format
+isl_format_rgb_to_rgba(enum isl_format rgb)
+{
+   assert(isl_format_is_rgb(rgb));
+
+   switch (rgb) {
+   case ISL_FORMAT_R32G32B32_FLOAT:    return ISL_FORMAT_R32G32B32A32_FLOAT;
+   case ISL_FORMAT_R32G32B32_SINT:     return ISL_FORMAT_R32G32B32A32_SINT;
+   case ISL_FORMAT_R32G32B32_UINT:     return ISL_FORMAT_R32G32B32A32_UINT;
+   case ISL_FORMAT_R32G32B32_UNORM:    return ISL_FORMAT_R32G32B32A32_UNORM;
+   case ISL_FORMAT_R32G32B32_SNORM:    return ISL_FORMAT_R32G32B32A32_SNORM;
+   case ISL_FORMAT_R32G32B32_SSCALED:  return ISL_FORMAT_R32G32B32A32_SSCALED;
+   case ISL_FORMAT_R32G32B32_USCALED:  return ISL_FORMAT_R32G32B32A32_USCALED;
+   case ISL_FORMAT_R32G32B32_SFIXED:   return ISL_FORMAT_R32G32B32A32_SFIXED;
+   case ISL_FORMAT_R8G8B8_UNORM:       return ISL_FORMAT_R8G8B8A8_UNORM;
+   case ISL_FORMAT_R8G8B8_SNORM:       return ISL_FORMAT_R8G8B8A8_SNORM;
+   case ISL_FORMAT_R8G8B8_SSCALED:     return ISL_FORMAT_R8G8B8A8_SSCALED;
+   case ISL_FORMAT_R8G8B8_USCALED:     return ISL_FORMAT_R8G8B8A8_USCALED;
+   case ISL_FORMAT_R16G16B16_FLOAT:    return ISL_FORMAT_R16G16B16A16_FLOAT;
+   case ISL_FORMAT_R16G16B16_UNORM:    return ISL_FORMAT_R16G16B16A16_UNORM;
+   case ISL_FORMAT_R16G16B16_SNORM:    return ISL_FORMAT_R16G16B16A16_SNORM;
+   case ISL_FORMAT_R16G16B16_SSCALED:  return ISL_FORMAT_R16G16B16A16_SSCALED;
+   case ISL_FORMAT_R16G16B16_USCALED:  return ISL_FORMAT_R16G16B16A16_USCALED;
+   case ISL_FORMAT_R8G8B8_UNORM_SRGB:  return ISL_FORMAT_R8G8B8A8_UNORM_SRGB;
+   case ISL_FORMAT_R16G16B16_UINT:     return ISL_FORMAT_R16G16B16A16_UINT;
+   case ISL_FORMAT_R16G16B16_SINT:     return ISL_FORMAT_R16G16B16A16_SINT;
+   case ISL_FORMAT_R8G8B8_UINT:        return ISL_FORMAT_R8G8B8A8_UINT;
+   case ISL_FORMAT_R8G8B8_SINT:        return ISL_FORMAT_R8G8B8A8_SINT;
+   default:
+      return ISL_FORMAT_UNSUPPORTED;
+   }
+}
+
+enum isl_format
+isl_format_rgb_to_rgbx(enum isl_format rgb)
+{
+   assert(isl_format_is_rgb(rgb));
+
+   switch (rgb) {
+   case ISL_FORMAT_R32G32B32_FLOAT:
+      return ISL_FORMAT_R32G32B32X32_FLOAT;
+   case ISL_FORMAT_R16G16B16_UNORM:
+      return ISL_FORMAT_R16G16B16X16_UNORM;
+   case ISL_FORMAT_R16G16B16_FLOAT:
+      return ISL_FORMAT_R16G16B16X16_FLOAT;
+   case ISL_FORMAT_R8G8B8_UNORM:
+      return ISL_FORMAT_R8G8B8X8_UNORM;
+   case ISL_FORMAT_R8G8B8_UNORM_SRGB:
+      return ISL_FORMAT_R8G8B8X8_UNORM_SRGB;
+   default:
+      return ISL_FORMAT_UNSUPPORTED;
+   }
+}
diff --git a/src/isl/isl_format_layout.csv b/src/isl/isl_format_layout.csv
new file mode 100644 (file)
index 0000000..2a302b0
--- /dev/null
@@ -0,0 +1,287 @@
+# Copyright 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+#
+# @file
+# @brief Layout of all hardware surface formats
+#
+# For the official list, see Broadwell PRM: Volume 2b: Command Reference:
+# Enumerations: SURFACE_FORMAT.
+#
+
+
+# Columns:
+#    name: format name in PRM
+#    bpb: bits per block
+#    bw: block width, in pixels
+#    bh: block height, in pixels
+#    bd: block depth, in pixels
+#    r: red channel, data type and bitwidth
+#    g: green channel
+#    b: blue channel
+#    a: alpha channel
+#    l: luminance channel
+#    i: intensity channel
+#    p: palette channel
+#    space: colorspace
+#    txc: texture compression
+#
+# Data Types:
+#     x: void
+#     r: raw
+#    un: unorm
+#    sn: snorm
+#    uf: ufloat
+#    sf: sfloat
+#    ux: ufixed
+#    sx: sfixed
+#    ui: uint
+#    si: sint
+#    us: uscaled
+#    ss: sscaled
+
+
+# Table is aligned with the Vim commands below, using the Align plugin:
+#     :AlignCtrl lr+ p8000000000000P1
+#     /^# name/,$ Align,
+
+# name                      , bpb, bw, bh, bd,    r,    g,    b,    a,    l,    i,   p,  space,   txc
+R32G32B32A32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32, sf32,     ,     ,    , linear,
+R32G32B32A32_SINT           , 128,  1,  1,  1, si32, si32, si32, si32,     ,     ,    , linear,
+R32G32B32A32_UINT           , 128,  1,  1,  1, ui32, ui32, ui32, ui32,     ,     ,    , linear,
+R32G32B32A32_UNORM          , 128,  1,  1,  1, un32, un32, un32, un32,     ,     ,    , linear,
+R32G32B32A32_SNORM          , 128,  1,  1,  1, sn32, sn32, sn32, sn32,     ,     ,    , linear,
+R64G64_FLOAT                , 128,  1,  1,  1, sf64, sf64,     ,     ,     ,     ,    , linear,
+R32G32B32X32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32,  x32,     ,     ,    , linear,
+R32G32B32A32_SSCALED        , 128,  1,  1,  1, ss32, ss32, ss32, ss32,     ,     ,    , linear,
+R32G32B32A32_USCALED        , 128,  1,  1,  1, us32, us32, us32, us32,     ,     ,    , linear,
+R32G32B32A32_SFIXED         , 128,  1,  1,  1, sx32, sx32, sx32, sx32,     ,     ,    , linear,
+R64G64_PASSTHRU             , 128,  1,  1,  1,  r64,  r64,     ,     ,     ,     ,    ,       ,
+R32G32B32_FLOAT             ,  96,  1,  1,  1, sf32, sf32, sf32,     ,     ,     ,    , linear,
+R32G32B32_SINT              ,  96,  1,  1,  1, si32, si32, si32,     ,     ,     ,    , linear,
+R32G32B32_UINT              ,  96,  1,  1,  1, ui32, ui32, ui32,     ,     ,     ,    , linear,
+R32G32B32_UNORM             ,  96,  1,  1,  1, un32, un32, un32,     ,     ,     ,    , linear,
+R32G32B32_SNORM             ,  96,  1,  1,  1, sn32, sn32, sn32,     ,     ,     ,    , linear,
+R32G32B32_SSCALED           ,  96,  1,  1,  1, ss32, ss32, ss32,     ,     ,     ,    , linear,
+R32G32B32_USCALED           ,  96,  1,  1,  1, us32, us32, us32,     ,     ,     ,    , linear,
+R32G32B32_SFIXED            ,  96,  1,  1,  1, sx32, sx32, sx32,     ,     ,     ,    , linear,
+R16G16B16A16_UNORM          ,  64,  1,  1,  1, un16, un16, un16, un16,     ,     ,    , linear,
+R16G16B16A16_SNORM          ,  64,  1,  1,  1, sn16, sn16, sn16, sn16,     ,     ,    , linear,
+R16G16B16A16_SINT           ,  64,  1,  1,  1, si16, si16, si16, si16,     ,     ,    , linear,
+R16G16B16A16_UINT           ,  64,  1,  1,  1, ui16, ui16, ui16, ui16,     ,     ,    , linear,
+R16G16B16A16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,
+R32G32_FLOAT                ,  64,  1,  1,  1, sf32, sf32,     ,     ,     ,     ,    , linear,
+R32G32_SINT                 ,  64,  1,  1,  1, si32, si32,     ,     ,     ,     ,    , linear,
+R32G32_UINT                 ,  64,  1,  1,  1, ui32, ui32,     ,     ,     ,     ,    , linear,
+R32_FLOAT_X8X24_TYPELESS    ,  64,  1,  1,  1, sf32,   x8,  x24,     ,     ,     ,    , linear,
+X32_TYPELESS_G8X24_UINT     ,  64,  1,  1,  1,  x32,  ui8,  x24,     ,     ,     ,    , linear,
+L32A32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32, sf32,     ,    , linear,
+R32G32_UNORM                ,  64,  1,  1,  1, un32, un32,     ,     ,     ,     ,    , linear,
+R32G32_SNORM                ,  64,  1,  1,  1, sn32, sn32,     ,     ,     ,     ,    , linear,
+R64_FLOAT                   ,  64,  1,  1,  1, sf64,     ,     ,     ,     ,     ,    , linear,
+R16G16B16X16_UNORM          ,  64,  1,  1,  1, un16, un16, un16,  x16,     ,     ,    , linear,
+R16G16B16X16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16,  x16,     ,     ,    , linear,
+A32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32,  x32,     ,    ,  alpha,
+L32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32, sf32,     ,    , linear,
+I32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32,     , sf32,    , linear,
+R16G16B16A16_SSCALED        ,  64,  1,  1,  1, ss16, ss16, ss16, ss16,     ,     ,    , linear,
+R16G16B16A16_USCALED        ,  64,  1,  1,  1, us16, us16, us16, us16,     ,     ,    , linear,
+R32G32_SSCALED              ,  64,  1,  1,  1, ss32, ss32,     ,     ,     ,     ,    , linear,
+R32G32_USCALED              ,  64,  1,  1,  1, us32, us32,     ,     ,     ,     ,    , linear,
+R32G32_SFIXED               ,  64,  1,  1,  1, sx32, sx32,     ,     ,     ,     ,    , linear,
+R64_PASSTHRU                ,  64,  1,  1,  1,  r64,     ,     ,     ,     ,     ,    ,       ,
+B8G8R8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,
+B8G8R8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,
+R10G10B10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    , linear,
+R10G10B10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,   srgb,
+R10G10B10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    , linear,
+R10G10B10_SNORM_A2_UNORM    ,  32,  1,  1,  1, sn10, sn10, sn10,  un2,     ,     ,    , linear,
+R8G8B8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,
+R8G8B8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,
+R8G8B8A8_SNORM              ,  32,  1,  1,  1,  sn8,  sn8,  sn8,  sn8,     ,     ,    , linear,
+R8G8B8A8_SINT               ,  32,  1,  1,  1,  si8,  si8,  si8,  si8,     ,     ,    , linear,
+R8G8B8A8_UINT               ,  32,  1,  1,  1,  ui8,  ui8,  ui8,  ui8,     ,     ,    , linear,
+R16G16_UNORM                ,  32,  1,  1,  1, un16, un16,     ,     ,     ,     ,    , linear,
+R16G16_SNORM                ,  32,  1,  1,  1, sn16, sn16,     ,     ,     ,     ,    , linear,
+R16G16_SINT                 ,  32,  1,  1,  1, si16, si16,     ,     ,     ,     ,    , linear,
+R16G16_UINT                 ,  32,  1,  1,  1, ui16, ui16,     ,     ,     ,     ,    , linear,
+R16G16_FLOAT                ,  32,  1,  1,  1, sf16, sf16,     ,     ,     ,     ,    , linear,
+B10G10R10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    , linear,
+B10G10R10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,   srgb,
+R11G11B10_FLOAT             ,  32,  1,  1,  1, uf11, uf11, uf10,     ,     ,     ,    , linear,
+R32_SINT                    ,  32,  1,  1,  1, si32,     ,     ,     ,     ,     ,    , linear,
+R32_UINT                    ,  32,  1,  1,  1, ui32,     ,     ,     ,     ,     ,    , linear,
+R32_FLOAT                   ,  32,  1,  1,  1, sf32,     ,     ,     ,     ,     ,    , linear,
+R24_UNORM_X8_TYPELESS       ,  32,  1,  1,  1, un24,   x8,     ,     ,     ,     ,    , linear,
+X24_TYPELESS_G8_UINT        ,  32,  1,  1,  1,  x24,  ui8,     ,     ,     ,     ,    , linear,
+L32_UNORM                   ,  32,  1,  1,  1,     ,     ,     ,     , un32,     ,    , linear,
+A32_UNORM                   ,  32,  1,  1,  1,     ,     ,     , un32,     ,     ,    ,  alpha,
+L16A16_UNORM                ,  32,  1,  1,  1,     ,     ,     , un16, un16,     ,    , linear,
+I24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8,     , un24,    , linear,
+L24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8, un24,     ,    , linear,
+A24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     , un24,   x8,     ,    ,  alpha,
+I32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     ,     , sf32,    , linear,
+L32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     , sf32,     ,    , linear,
+A32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     , sf32,     ,     ,    ,  alpha,
+X8B8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    , linear,
+A8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,   x8,  un8,     ,     ,    , linear,
+B8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    , linear,
+B8G8R8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    , linear,
+B8G8R8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,   srgb,
+R8G8B8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    , linear,
+R8G8B8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,   srgb,
+R9G9B9E5_SHAREDEXP          ,  32,  1,  1,  1,  ui9,  ui9,  ui9,     ,     ,     ,    , linear,
+B10G10R10X2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,   x2,     ,     ,    , linear,
+L16A16_FLOAT                ,  32,  1,  1,  1,     ,     ,     , sf16, sf16,     ,    , linear,
+R32_UNORM                   ,  32,  1,  1,  1, un32,     ,     ,     ,     ,     ,    , linear,
+R32_SNORM                   ,  32,  1,  1,  1, sn32,     ,     ,     ,     ,     ,    , linear,
+R10G10B10X2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,   x2,     ,     ,    , linear,
+R8G8B8A8_SSCALED            ,  32,  1,  1,  1,  ss8,  ss8,  ss8,  ss8,     ,     ,    , linear,
+R8G8B8A8_USCALED            ,  32,  1,  1,  1,  us8,  us8,  us8,  us8,     ,     ,    , linear,
+R16G16_SSCALED              ,  32,  1,  1,  1, ss16, ss16,     ,     ,     ,     ,    , linear,
+R16G16_USCALED              ,  32,  1,  1,  1, us16, us16,     ,     ,     ,     ,    , linear,
+R32_SSCALED                 ,  32,  1,  1,  1, ss32,     ,     ,     ,     ,     ,    , linear,
+R32_USCALED                 ,  32,  1,  1,  1, us32,     ,     ,     ,     ,     ,    , linear,
+B5G6R5_UNORM                ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    , linear,
+B5G6R5_UNORM_SRGB           ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    ,   srgb,
+B5G5R5A1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    , linear,
+B5G5R5A1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    ,   srgb,
+B4G4R4A4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,
+B4G4R4A4_UNORM_SRGB         ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,
+R8G8_UNORM                  ,  16,  1,  1,  1,  un8,  un8,     ,     ,     ,     ,    , linear,
+R8G8_SNORM                  ,  16,  1,  1,  1,  sn8,  sn8,     ,     ,     ,     ,    , linear,
+R8G8_SINT                   ,  16,  1,  1,  1,  si8,  si8,     ,     ,     ,     ,    , linear,
+R8G8_UINT                   ,  16,  1,  1,  1,  ui8,  ui8,     ,     ,     ,     ,    , linear,
+R16_UNORM                   ,  16,  1,  1,  1, un16,     ,     ,     ,     ,     ,    , linear,
+R16_SNORM                   ,  16,  1,  1,  1, sn16,     ,     ,     ,     ,     ,    , linear,
+R16_SINT                    ,  16,  1,  1,  1, si16,     ,     ,     ,     ,     ,    , linear,
+R16_UINT                    ,  16,  1,  1,  1, ui16,     ,     ,     ,     ,     ,    , linear,
+R16_FLOAT                   ,  16,  1,  1,  1, sf16,     ,     ,     ,     ,     ,    , linear,
+A8P8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
+A8P8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
+I16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , un16,    , linear,
+L16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     , un16,     ,    , linear,
+A16_UNORM                   ,  16,  1,  1,  1,     ,     ,     , un16,     ,     ,    ,  alpha,
+L8A8_UNORM                  ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    , linear,
+I16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , sf16,    , linear,
+L16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     , sf16,     ,    , linear,
+A16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     , sf16,     ,     ,    ,  alpha,
+L8A8_UNORM_SRGB             ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    ,   srgb,
+R5G5_SNORM_B6_UNORM         ,  16,  1,  1,  1,  sn5,  sn5,  un6,     ,     ,     ,    , linear,
+B5G5R5X1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    , linear,
+B5G5R5X1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    ,   srgb,
+R8G8_SSCALED                ,  16,  1,  1,  1,  ss8,  ss8,     ,     ,     ,     ,    , linear,
+R8G8_USCALED                ,  16,  1,  1,  1,  us8,  us8,     ,     ,     ,     ,    , linear,
+R16_SSCALED                 ,  16,  1,  1,  1, ss16,     ,     ,     ,     ,     ,    , linear,
+R16_USCALED                 ,  16,  1,  1,  1, us16,     ,     ,     ,     ,     ,    , linear,
+P8A8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
+P8A8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
+A1B5G5R5_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    , linear,
+A4B4G4R4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,
+L8A8_UINT                   ,  16,  1,  1,  1,     ,     ,     ,  ui8,  ui8,     ,    , linear,
+L8A8_SINT                   ,  16,  1,  1,  1,     ,     ,     ,  si8,  si8,     ,    , linear,
+R8_UNORM                    ,   8,  1,  1,  1,  un8,     ,     ,     ,     ,     ,    , linear,
+R8_SNORM                    ,   8,  1,  1,  1,  sn8,     ,     ,     ,     ,     ,    , linear,
+R8_SINT                     ,   8,  1,  1,  1,  si8,     ,     ,     ,     ,     ,    , linear,
+R8_UINT                     ,   8,  1,  1,  1,  ui8,     ,     ,     ,     ,     ,    , linear,
+A8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,  un8,     ,     ,    ,  alpha,
+I8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  un8,    , linear,
+L8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    , linear,
+P4A4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
+A4P4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
+R8_SSCALED                  ,   8,  1,  1,  1,  ss8,     ,     ,     ,     ,     ,    , linear,
+R8_USCALED                  ,   8,  1,  1,  1,  us8,     ,     ,     ,     ,     ,    , linear,
+P8_UNORM_PALETTE0           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8, linear,
+L8_UNORM_SRGB               ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    ,   srgb,
+P8_UNORM_PALETTE1           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8, linear,
+P4A4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
+A4P4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
+Y8_UNORM                    ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+L8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  ui8,     ,    , linear,
+L8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  si8,     ,    , linear,
+I8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  ui8,    , linear,
+I8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  si8,    , linear,
+DXT1_RGB_SRGB               ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    ,   srgb,  dxt1
+R1_UNORM                    ,   1,  1,  1,  1,  un1,     ,     ,     ,     ,     ,    , linear,
+YCRCB_NORMAL                ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+YCRCB_SWAPUVY               ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+P2_UNORM_PALETTE0           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2, linear,
+P2_UNORM_PALETTE1           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2, linear,
+BC1_UNORM                   ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt1
+BC2_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt3
+BC3_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt5
+BC4_UNORM                   ,  64,  4,  4,  1,  un8,     ,     ,     ,     ,     ,    , linear, rgtc1
+BC5_UNORM                   , 128,  4,  4,  1,  un8,  un8,     ,     ,     ,     ,    , linear, rgtc2
+BC1_UNORM_SRGB              ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt1
+BC2_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt3
+BC3_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt5
+MONO8                       ,   1,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,
+YCRCB_SWAPUV                ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+YCRCB_SWAPY                 ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+DXT1_RGB                    ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    , linear,  dxt1
+FXT1                        , 128,  8,  4,  1,  un4,  un4,  un4,     ,     ,     ,    , linear,  fxt1
+R8G8B8_UNORM                ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,
+R8G8B8_SNORM                ,  24,  1,  1,  1,  sn8,  sn8,  sn8,     ,     ,     ,    , linear,
+R8G8B8_SSCALED              ,  24,  1,  1,  1,  ss8,  ss8,  ss8,     ,     ,     ,    , linear,
+R8G8B8_USCALED              ,  24,  1,  1,  1,  us8,  us8,  us8,     ,     ,     ,    , linear,
+R64G64B64A64_FLOAT          , 256,  1,  1,  1, sf64, sf64, sf64, sf64,     ,     ,    , linear,
+R64G64B64_FLOAT             , 192,  1,  1,  1, sf64, sf64, sf64,     ,     ,     ,    , linear,
+BC4_SNORM                   ,  64,  4,  4,  1,  sn8,     ,     ,     ,     ,     ,    , linear, rgtc1
+BC5_SNORM                   , 128,  4,  4,  1,  sn8,  sn8,     ,     ,     ,     ,    , linear, rgtc2
+R16G16B16_FLOAT             ,  48,  1,  1,  1, sf16, sf16, sf16,     ,     ,     ,    , linear,
+R16G16B16_UNORM             ,  48,  1,  1,  1, un16, un16, un16,     ,     ,     ,    , linear,
+R16G16B16_SNORM             ,  48,  1,  1,  1, sn16, sn16, sn16,     ,     ,     ,    , linear,
+R16G16B16_SSCALED           ,  48,  1,  1,  1, ss16, ss16, ss16,     ,     ,     ,    , linear,
+R16G16B16_USCALED           ,  48,  1,  1,  1, us16, us16, us16,     ,     ,     ,    , linear,
+BC6H_SF16                   , 128,  4,  4,  1, sf16, sf16, sf16,     ,     ,     ,    , linear,  bptc
+BC7_UNORM                   , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,  bptc
+BC7_UNORM_SRGB              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  bptc
+BC6H_UF16                   , 128,  4,  4,  1, uf16, uf16, uf16,     ,     ,     ,    , linear,  bptc
+PLANAR_420_8                ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
+R8G8B8_UNORM_SRGB           ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,   srgb,
+ETC1_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,  etc1
+ETC2_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,  etc2
+EAC_R11                     ,  64,  4,  4,  1, un11,     ,     ,     ,     ,     ,    , linear,  etc2
+EAC_RG11                    , 128,  4,  4,  1, un11, un11,     ,     ,     ,     ,    , linear,  etc2
+EAC_SIGNED_R11              ,  64,  4,  4,  1, sn11,     ,     ,     ,     ,     ,    , linear,  etc2
+EAC_SIGNED_RG11             , 128,  4,  4,  1, sn11, sn11,     ,     ,     ,     ,    , linear,  etc2
+ETC2_SRGB8                  ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    ,   srgb,  etc2
+R16G16B16_UINT              ,  48,  1,  1,  1, ui16, ui16, ui16,     ,     ,     ,    , linear,
+R16G16B16_SINT              ,  48,  1,  1,  1, si16, si16, si16,     ,     ,     ,    , linear,
+R32_SFIXED                  ,  32,  1,  1,  1, sx32,     ,     ,     ,     ,     ,    , linear,
+R10G10B10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    , linear,
+R10G10B10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    , linear,
+R10G10B10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    , linear,
+R10G10B10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    , linear,
+B10G10R10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    , linear,
+B10G10R10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    , linear,
+B10G10R10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    , linear,
+B10G10R10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    , linear,
+B10G10R10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    , linear,
+R64G64B64A64_PASSTHRU       , 256,  1,  1,  1,  r64,  r64,  r64,  r64,     ,     ,    ,       ,
+R64G64B64_PASSTHRU          , 192,  1,  1,  1,  r64,  r64,  r64,     ,     ,     ,    ,       ,
+ETC2_RGB8_PTA               ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    , linear,  etc2
+ETC2_SRGB8_PTA              ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    ,   srgb,  etc2
+ETC2_EAC_RGBA8              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,  etc2
+ETC2_EAC_SRGB8_A8           , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  etc2
+R8G8B8_UINT                 ,  24,  1,  1,  1,  ui8,  ui8,  ui8,     ,     ,     ,    , linear,
+R8G8B8_SINT                 ,  24,  1,  1,  1,  si8,  si8,  si8,     ,     ,     ,    , linear,
+RAW                         ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,       ,
diff --git a/src/isl/isl_format_layout_gen.bash b/src/isl/isl_format_layout_gen.bash
new file mode 100755 (executable)
index 0000000..db88382
--- /dev/null
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+#
+# Copyright 2015 Intel Corporation
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice (including the next
+#  paragraph) shall be included in all copies or substantial portions of the
+#  Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+#  IN THE SOFTWARE.
+
+set -eu
+set -o pipefail
+
+cat <<'EOF'
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl.h"
+
+const struct isl_format_layout
+isl_format_layouts[] = {
+EOF
+
+sed -r '
+# Delete comment lines and empty lines
+/^[[:space:]]*#/d
+/^[[:space:]]*$/d
+
+# Delete spaces
+s/[[:space:]]//g
+
+# Translate formats
+s/^([A-Za-z0-9_]+),*/ISL_FORMAT_\1,/
+
+# Translate data type of channels
+s/\<x([0-9]+),/ISL_VOID@\1,/g
+s/\<r([0-9]+),/ISL_RAW@\1,/g
+s/\<un([0-9]+),/ISL_UNORM@\1,/g
+s/\<sn([0-9]+),/ISL_SNORM@\1,/g
+s/\<uf([0-9]+),/ISL_UFLOAT@\1,/g
+s/\<sf([0-9]+),/ISL_SFLOAT@\1,/g
+s/\<ux([0-9]+),/ISL_UFIXED@\1,/g
+s/\<sx([0-9]+),/ISL_SFIXED@\1,/g
+s/\<ui([0-9]+),/ISL_UINT@\1,/g
+s/\<si([0-9]+),/ISL_SINT@\1,/g
+s/\<us([0-9]+),/ISL_USCALED@\1,/g
+s/\<ss([0-9]+),/ISL_SSCALED@\1,/g
+
+# Translate colorspaces
+# Interpret alpha-only formats as having no colorspace.
+s/\<(linear|srgb|yuv)\>/ISL_COLORSPACE_\1/
+s/\<alpha\>//
+
+# Translate texture compression
+s/\<(dxt|fxt|rgtc|bptc|etc)([0-9]*)\>/ISL_TXC_\1\2/
+' |
+tr 'a-z' 'A-Z' | # Convert to uppercase
+while IFS=, read -r format bpb bw bh bd \
+                    red green blue alpha \
+                    luminance intensity palette \
+                    colorspace txc
+do
+    : ${colorspace:=ISL_COLORSPACE_NONE}
+    : ${txc:=ISL_TXC_NONE}
+
+    cat <<EOF
+   [$format] = {
+      $format,
+      .bs = $((bpb/8)),
+      .bw = $bw, .bh = $bh, .bd = $bd,
+      .channels = {
+          .r = { $red },
+          .g = { $green },
+          .b = { $blue },
+          .a = { $alpha },
+          .l = { $luminance },
+          .i = { $intensity },
+          .p = { $palette },
+      },
+      .colorspace = $colorspace,
+      .txc = $txc,
+   },
+
+EOF
+done |
+sed -r '
+# Collapse empty channels
+s/\{  \}/{}/
+
+# Split non-empty channels into two members: base type and bit size
+s/@/, /
+'
+
+# Terminate the table
+printf '};\n'
diff --git a/src/isl/isl_gen4.c b/src/isl/isl_gen4.c
new file mode 100644 (file)
index 0000000..52aa565
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl_gen4.h"
+#include "isl_priv.h"
+
+bool
+gen4_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout)
+{
+   /* Gen4 and Gen5 do not support MSAA */
+   assert(info->samples >= 1); /* NOTE(review): samples > 1 is not rejected here — presumably validated by the caller; confirm */
+
+   *msaa_layout = ISL_MSAA_LAYOUT_NONE; /* the only layout possible without MSAA support */
+   return true;
+}
+
+void
+gen4_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el)
+{
+   assert(info->samples == 1);
+   assert(msaa_layout == ISL_MSAA_LAYOUT_NONE);
+   assert(!isl_tiling_is_std_y(tiling));
+
+   /* Note that neither the surface's horizontal nor vertical image alignment
+    * is programmable on gen4 nor gen5.
+    *
+    * From the G35 PRM (2008-01), Volume 1 Graphics Core, Section 6.17.3.4
+    * Alignment Unit Size:
+    *
+    *    Note that the compressed formats are padded to a full compression
+    *    cell.
+    *
+    *    +------------------------+--------+--------+
+    *    | format                 | halign | valign |
+    *    +------------------------+--------+--------+
+    *    | YUV 4:2:2 formats      |      4 |      2 |
+    *    | uncompressed formats   |      4 |      2 |
+    *    +------------------------+--------+--------+
+    */
+
+   if (isl_format_is_compressed(info->format)) {
+      *image_align_el = isl_extent3d(1, 1, 1); /* result is in compression-block units, so one block */
+      return;
+   }
+
+   *image_align_el = isl_extent3d(4, 2, 1); /* halign=4, valign=2 per the table above */
+}
diff --git a/src/isl/isl_gen4.h b/src/isl/isl_gen4.h
new file mode 100644 (file)
index 0000000..06cd70b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "isl_priv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Chooses the MSAA layout for a gen4/gen5 surface; defined in isl_gen4.c. */
+bool
+gen4_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout);
+
+/* Chooses the fixed subimage alignment (in surface elements) for a
+ * gen4/gen5 surface; defined in isl_gen4.c.
+ */
+void
+gen4_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_gen6.c b/src/isl/isl_gen6.c
new file mode 100644 (file)
index 0000000..24c3939
--- /dev/null
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl_gen6.h"
+#include "isl_priv.h"
+
+/**
+ * Choose the MSAA layout for a Sandybridge (gen6) surface.
+ *
+ * On success, *msaa_layout is set and true is returned.  Returns false when
+ * the requested sample count cannot be supported for this configuration.
+ */
+bool
+gen6_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   assert(ISL_DEV_GEN(dev) == 6);
+   assert(info->samples >= 1);
+
+   if (info->samples == 1) {
+      *msaa_layout = ISL_MSAA_LAYOUT_NONE;
+      /* Single-sampled surfaces are always valid.  Returning false here
+       * would reject every non-MSAA gen6 surface; gen4, gen7, and gen8 all
+       * return true for this case.
+       */
+      return true;
+   }
+
+   /* From the Sandybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Surface
+    * Format:
+    *
+    *    If Number of Multisamples is set to a value other than
+    *    MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *    formats:
+    *
+    *       - any format with greater than 64 bits per element
+    *       - any compressed texture format (BC*)
+    *       - any YCRCB* format
+    */
+   if (fmtl->bs > 8) /* bs is in bytes: 8 bytes == 64 bits per element */
+      return false;
+   if (isl_format_is_compressed(info->format))
+      return false;
+   if (isl_format_is_yuv(info->format))
+      return false;
+
+   /* From the Sandybridge PRM, Volume 4 Part 1 p85, SURFACE_STATE, Number of
+    * Multisamples:
+    *
+    *    If this field is any value other than MULTISAMPLECOUNT_1 the
+    *    following restrictions apply:
+    *
+    *       - the Surface Type must be SURFTYPE_2D
+    *       - [...]
+    */
+   if (info->dim != ISL_SURF_DIM_2D)
+      return false;
+
+   /* More obvious restrictions */
+   if (isl_surf_usage_is_display(info->usage))
+      return false;
+   if (tiling == ISL_TILING_LINEAR)
+      return false;
+   if (info->levels > 1)
+      return false;
+
+   /* Gen6 has no MSFMT_MSS array layout; interleaved is the only option. */
+   *msaa_layout = ISL_MSAA_LAYOUT_INTERLEAVED;
+   return true;
+}
+
+/**
+ * Choose the subimage alignment, in units of surface elements, for
+ * a gen6 surface.  Horizontal alignment is fixed at 4; the vertical
+ * alignment factor depends on usage, per the PRM excerpts below.
+ */
+void
+gen6_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el)
+{
+   /* Note that the surface's horizontal image alignment is not programmable
+    * on Sandybridge.
+    *
+    * From the Sandybridge PRM (2011-05), Volume 1, Part 1, Section 7.18.3.4
+    * Alignment Unit Size:
+    *
+    *    Note that the compressed formats are padded to a full compression cell.
+    *
+    *    +------------------------+--------+--------+
+    *    | format                 | halign | valign |
+    *    +------------------------+--------+--------+
+    *    | YUV 4:2:2 formats      |      4 |      * |
+    *    | uncompressed formats   |      4 |      * |
+    *    +------------------------+--------+--------+
+    *
+    *    * For these formats, the vertical alignment factor “j” is determined
+    *      as follows:
+    *       - j = 4 for any depth buffer
+    *       - j = 2 for separate stencil buffer
+    *       - j = 4 for any render target surface is multisampled (4x)
+    *       - j = 2 for all other render target surface
+    *
+    * From the Sandybridge PRM (2011-05), Volume 4, Part 1, Section 2.11.2
+    * SURFACE_STATE, Surface Vertical Alignment:
+    *
+    *    - This field must be set to VALIGN_2 if the Surface Format is 96 bits
+    *      per element (BPE).
+    *
+    *    - Value of 1 [VALIGN_4] is not supported for format YCRCB_NORMAL
+    *      (0x182), YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY
+    *      (0x190)
+    */
+
+   if (isl_format_is_compressed(info->format)) {
+      /* One compression block, expressed in elements. */
+      *image_align_el = isl_extent3d(1, 1, 1);
+      return;
+   }
+
+   if (isl_format_is_yuv(info->format)) {
+      /* YUV formats must use VALIGN_2 (see the SURFACE_STATE note above). */
+      *image_align_el = isl_extent3d(4, 2, 1);
+      return;
+   }
+
+   if (info->samples > 1) {
+      /* j = 4 for multisampled render targets */
+      *image_align_el = isl_extent3d(4, 4, 1);
+      return;
+   }
+
+   if (isl_surf_usage_is_depth_or_stencil(info->usage) &&
+       !ISL_DEV_USE_SEPARATE_STENCIL(dev)) {
+      /* interleaved depthstencil buffer */
+      *image_align_el = isl_extent3d(4, 4, 1);
+      return;
+   }
+
+   if (isl_surf_usage_is_depth(info->usage)) {
+      /* separate depth buffer */
+      *image_align_el = isl_extent3d(4, 4, 1);
+      return;
+   }
+
+   if (isl_surf_usage_is_stencil(info->usage)) {
+      /* separate stencil buffer */
+      *image_align_el = isl_extent3d(4, 2, 1);
+      return;
+   }
+
+   /* j = 2 for all other (single-sampled) render target surfaces */
+   *image_align_el = isl_extent3d(4, 2, 1);
+}
diff --git a/src/isl/isl_gen6.h b/src/isl/isl_gen6.h
new file mode 100644 (file)
index 0000000..0779c67
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "isl_priv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Chooses the MSAA layout for a gen6 surface; defined in isl_gen6.c. */
+bool
+gen6_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout);
+
+/* Chooses the subimage alignment (in surface elements) for a gen6 surface;
+ * defined in isl_gen6.c.
+ */
+void
+gen6_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_gen7.c b/src/isl/isl_gen7.c
new file mode 100644 (file)
index 0000000..7064e85
--- /dev/null
@@ -0,0 +1,395 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl_gen7.h"
+#include "isl_priv.h"
+
+/**
+ * Choose the MSAA layout for an Ivybridge (gen7) surface.
+ *
+ * On success, *msaa_layout is set and true is returned.  Returns false when
+ * the requested sample count cannot be supported for this configuration.
+ */
+bool
+gen7_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   bool require_array = false;
+   bool require_interleaved = false;
+
+   assert(ISL_DEV_GEN(dev) == 7);
+   assert(info->samples >= 1);
+
+   if (info->samples == 1) {
+      *msaa_layout = ISL_MSAA_LAYOUT_NONE;
+      return true;
+   }
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p63, SURFACE_STATE, Surface
+    * Format:
+    *
+    *    If Number of Multisamples is set to a value other than
+    *    MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *    formats: any format with greater than 64 bits per element, any
+    *    compressed texture format (BC*), and any YCRCB* format.
+    */
+   if (fmtl->bs > 8) /* bs is in bytes: 8 bytes == 64 bits per element */
+      return false;
+   if (isl_format_is_compressed(info->format))
+      return false;
+   if (isl_format_is_yuv(info->format))
+      return false;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p73, SURFACE_STATE, Number of
+    * Multisamples:
+    *
+    *    - If this field is any value other than MULTISAMPLECOUNT_1, the
+    *      Surface Type must be SURFTYPE_2D.
+    *
+    *    - If this field is any value other than MULTISAMPLECOUNT_1, Surface
+    *      Min LOD, Mip Count / LOD, and Resource Min LOD must be set to zero
+    */
+   if (info->dim != ISL_SURF_DIM_2D)
+      return false;
+   if (info->levels > 1)
+      return false;
+
+   /* The Ivybridge PRM insists twice that signed integer formats cannot be
+    * multisampled.
+    *
+    * From the Ivybridge PRM, Volume 4 Part 1 p73, SURFACE_STATE, Number of
+    * Multisamples:
+    *
+    *    - This field must be set to MULTISAMPLECOUNT_1 for SINT MSRTs when
+    *      all RT channels are not written.
+    *
+    * And errata from the Ivybridge PRM, Volume 4 Part 1 p77,
+    * RENDER_SURFACE_STATE, MCS Enable:
+    *
+    *   This field must be set to 0 [MULTISAMPLECOUNT_1] for all SINT MSRTs
+    *   when all RT channels are not written.
+    *
+    * Note that the above SINT restrictions apply only to *MSRTs* (that is,
+    * *multisampled* render targets). The restrictions seem to permit an MCS
+    * if the render target is singlesampled.
+    */
+   if (isl_format_has_sint_channel(info->format))
+      return false;
+
+   /* More obvious restrictions */
+   if (isl_surf_usage_is_display(info->usage))
+      return false;
+   if (tiling == ISL_TILING_LINEAR)
+      return false;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Multisampled
+    * Surface Storage Format:
+    *
+    *    +---------------------+----------------------------------------------------------------+
+    *    | MSFMT_MSS           | Multisampled surface was/is rendered as a render target        |
+    *    | MSFMT_DEPTH_STENCIL | Multisampled surface was rendered as a depth or stencil buffer |
+    *    +---------------------+----------------------------------------------------------------+
+    *
+    * In the table above, MSFMT_MSS refers to ISL_MSAA_LAYOUT_ARRAY, and
+    * MSFMT_DEPTH_STENCIL refers to ISL_MSAA_LAYOUT_INTERLEAVED.
+    */
+   if (isl_surf_usage_is_depth_or_stencil(info->usage))
+      require_interleaved = true;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Multisampled
+    * Surface Storage Format:
+    *
+    *    If the surface’s Number of Multisamples is MULTISAMPLECOUNT_8, Width
+    *    is >= 8192 (meaning the actual surface width is >= 8193 pixels), this
+    *    field must be set to MSFMT_MSS.
+    *
+    * Use >= so the test implements the quoted threshold rather than the
+    * single value 8192.
+    * NOTE(review): SURFACE_STATE.Width is the programmed (pixels - 1) value;
+    * if info->width is in pixels, the cutoff may need to be 8193 -- confirm
+    * against isl's width convention.
+    */
+   if (info->samples == 8 && info->width >= 8192)
+      require_array = true;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Multisampled
+    * Surface Storage Format:
+    *
+    *    If the surface’s Number of Multisamples is MULTISAMPLECOUNT_8,
+    *    ((Depth+1) * (Height+1)) is > 4,194,304, OR if the surface’s Number
+    *    of Multisamples is MULTISAMPLECOUNT_4, ((Depth+1) * (Height+1)) is
+    *    > 8,388,608, this field must be set to MSFMT_DEPTH_STENCIL.
+    */
+   if ((info->samples == 8 && info->height > 4194304u) ||
+       (info->samples == 4 && info->height > 8388608u))
+      require_interleaved = true;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Multisampled
+    * Surface Storage Format:
+    *
+    *    This field must be set to MSFMT_DEPTH_STENCIL if Surface Format is
+    *    one of the following: I24X8_UNORM, L24X8_UNORM, A24X8_UNORM, or
+    *    R24_UNORM_X8_TYPELESS.
+    */
+   if (info->format == ISL_FORMAT_I24X8_UNORM ||
+       info->format == ISL_FORMAT_L24X8_UNORM ||
+       info->format == ISL_FORMAT_A24X8_UNORM ||
+       info->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS)
+      require_interleaved = true;
+
+   /* Contradictory requirements: no valid layout exists. */
+   if (require_array && require_interleaved)
+      return false;
+
+   if (require_interleaved) {
+      *msaa_layout = ISL_MSAA_LAYOUT_INTERLEAVED;
+      return true;
+   }
+
+   /* Default to the array layout because it permits multisample
+    * compression.
+    */
+   *msaa_layout = ISL_MSAA_LAYOUT_ARRAY;
+   return true;
+}
+
+/**
+ * Does this surface format require SurfaceVerticalAlignment VALIGN_2?
+ *
+ * From the Ivybridge PRM (2012-05-31), Volume 4, Part 1, Section 2.12.1,
+ * RENDER_SURFACE_STATE Surface Vertical Alignment:
+ *
+ *    - Value of 1 [VALIGN_4] is not supported for format YCRCB_NORMAL
+ *      (0x182), YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY
+ *      (0x190)
+ *
+ *    - VALIGN_4 is not supported for surface format R32G32B32_FLOAT.
+ */
+static bool
+gen7_format_needs_valign2(const struct isl_device *dev,
+                          enum isl_format format)
+{
+   /* The restriction is lifted on gens after 7. */
+   if (ISL_DEV_GEN(dev) > 7)
+      return false;
+
+   if (format == ISL_FORMAT_R32G32B32_FLOAT)
+      return true;
+
+   return isl_format_is_yuv(format);
+}
+
+/**
+ * @brief Filter out tiling flags that are incompatible with the surface.
+ *
+ * The resultant outgoing @a flags is a subset of the incoming @a flags. The
+ * outgoing flags may be empty (0x0) if the incoming flags were too
+ * restrictive.
+ *
+ * For example, if the surface will be used for a display
+ * (ISL_SURF_USAGE_DISPLAY_BIT), then this function filters out all tiling
+ * flags except ISL_TILING_X_BIT and ISL_TILING_LINEAR_BIT.
+ */
+void
+gen7_filter_tiling(const struct isl_device *dev,
+                   const struct isl_surf_init_info *restrict info,
+                   isl_tiling_flags_t *flags)
+{
+   /* IVB+ requires separate stencil */
+   assert(ISL_DEV_USE_SEPARATE_STENCIL(dev));
+
+   /* Clear flags unsupported on this hardware */
+   if (ISL_DEV_GEN(dev) < 9) {
+      *flags &= ~ISL_TILING_Yf_BIT;
+      *flags &= ~ISL_TILING_Ys_BIT;
+   }
+
+   /* And... clear the Yf and Ys bits anyway because Anvil doesn't support
+    * them yet.
+    */
+   *flags &= ~ISL_TILING_Yf_BIT; /* FINISHME[SKL]: Support Yf */
+   *flags &= ~ISL_TILING_Ys_BIT; /* FINISHME[SKL]: Support Ys */
+
+   if (isl_surf_usage_is_depth(info->usage)) {
+      /* Depth requires Y. */
+      *flags &= ISL_TILING_ANY_Y_MASK;
+   }
+
+   /* Separate stencil requires W tiling, and W tiling requires separate
+    * stencil.
+    */
+   if (isl_surf_usage_is_stencil(info->usage)) {
+      *flags &= ISL_TILING_W_BIT;
+   } else {
+      *flags &= ~ISL_TILING_W_BIT;
+   }
+
+   if (info->usage & (ISL_SURF_USAGE_DISPLAY_ROTATE_90_BIT |
+                      ISL_SURF_USAGE_DISPLAY_ROTATE_180_BIT |
+                      ISL_SURF_USAGE_DISPLAY_ROTATE_270_BIT)) {
+      /* Rotated display surfaces imply the generic display bit.  Note that
+       * this is a *usage* check: *flags holds tiling bits, so testing
+       * a usage bit against it would be a category error.
+       */
+      assert(info->usage & ISL_SURF_USAGE_DISPLAY_BIT);
+      isl_finishme("%s:%s: handle rotated display surfaces",
+                   __FILE__, __func__);
+   }
+
+   if (info->usage & (ISL_SURF_USAGE_DISPLAY_FLIP_X_BIT |
+                      ISL_SURF_USAGE_DISPLAY_FLIP_Y_BIT)) {
+      /* Same reasoning as the rotation assert above. */
+      assert(info->usage & ISL_SURF_USAGE_DISPLAY_BIT);
+      isl_finishme("%s:%s: handle flipped display surfaces",
+                   __FILE__, __func__);
+   }
+
+   if (info->usage & ISL_SURF_USAGE_DISPLAY_BIT) {
+      /* Before Skylake, the display engine does not accept Y */
+      /* FINISHME[SKL]: Y tiling for display surfaces */
+      *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT);
+   }
+
+   if (info->samples > 1) {
+      /* From the Sandybridge PRM, Volume 4 Part 1, SURFACE_STATE Tiled
+       * Surface:
+       *
+       *   For multisample render targets, this field must be 1 (true). MSRTs
+       *   can only be tiled.
+       *
+       * Multisample surfaces never require X tiling, and Y tiling generally
+       * performs better than X. So choose Y. (Unless it's stencil, then it
+       * must be W).
+       */
+      *flags &= (ISL_TILING_ANY_Y_MASK | ISL_TILING_W_BIT);
+   }
+
+   /* workaround */
+   if (ISL_DEV_GEN(dev) == 7 &&
+       gen7_format_needs_valign2(dev, info->format) &&
+       (info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
+       info->samples == 1) {
+      /* Y tiling is illegal. From the Ivybridge PRM, Vol4 Part1 2.12.2.1,
+       * SURFACE_STATE Surface Vertical Alignment:
+       *
+       *     This field must be set to VALIGN_4 for all tiled Y Render Target
+       *     surfaces.
+       */
+      *flags &= ~ISL_TILING_Y0_BIT;
+   }
+}
+
+/**
+ * Choose horizontal subimage alignment, in units of surface elements.
+ *
+ * Returns 1 (one block) for compressed formats, 8 for Z16 depth and stencil
+ * surfaces, and 4 otherwise, per the PRM excerpt below.
+ */
+static uint32_t
+gen7_choose_halign_el(const struct isl_device *dev,
+                      const struct isl_surf_init_info *restrict info)
+{
+   /* One compression block, expressed in elements. */
+   if (isl_format_is_compressed(info->format))
+      return 1;
+
+   /* From the Ivybridge PRM (2012-05-31), Volume 4, Part 1, Section 2.12.1,
+    * RENDER_SURFACE_STATE Surface Horizontal Alignment:
+    *
+    *    - This field is intended to be set to HALIGN_8 only if the surface
+    *      was rendered as a depth buffer with Z16 format or a stencil buffer,
+    *      since these surfaces support only alignment of 8. Use of HALIGN_8
+    *      for other surfaces is supported, but uses more memory.
+    */
+   if (isl_surf_info_is_z16(info) ||
+       isl_surf_usage_is_stencil(info->usage))
+      return 8;
+
+   return 4;
+}
+
+/**
+ * Choose vertical subimage alignment, in units of surface elements.
+ *
+ * Returns 1 (one block) for compressed formats; otherwise 4 when the
+ * hardware requires VALIGN_4, else 2 (which conserves memory).  The
+ * VALIGN_2 and VALIGN_4 requirements are mutually exclusive and asserted
+ * as such below.
+ */
+static uint32_t
+gen7_choose_valign_el(const struct isl_device *dev,
+                      const struct isl_surf_init_info *restrict info,
+                      enum isl_tiling tiling)
+{
+   bool require_valign2 = false;
+   bool require_valign4 = false;
+
+   /* One compression block, expressed in elements. */
+   if (isl_format_is_compressed(info->format))
+      return 1;
+
+   if (gen7_format_needs_valign2(dev, info->format))
+      require_valign2 = true;
+
+   /* From the Ivybridge PRM, Volume 4, Part 1, Section 2.12.1:
+    * RENDER_SURFACE_STATE Surface Vertical Alignment:
+    *
+    *    - This field is intended to be set to VALIGN_4 if the surface was
+    *      rendered as a depth buffer, for a multisampled (4x) render target,
+    *      or for a multisampled (8x) render target, since these surfaces
+    *      support only alignment of 4.  Use of VALIGN_4 for other surfaces is
+    *      supported, but uses more memory.  This field must be set to
+    *      VALIGN_4 for all tiled Y Render Target surfaces.
+    *
+    */
+   if (isl_surf_usage_is_depth(info->usage) ||
+       info->samples > 1 ||
+       tiling == ISL_TILING_Y0) {
+      require_valign4 = true;
+   }
+
+   if (isl_surf_usage_is_stencil(info->usage)) {
+      /* The Ivybridge PRM states that the stencil buffer's vertical alignment
+       * is 8 [Ivybridge PRM, Volume 1, Part 1, Section 6.18.4.4 Alignment
+       * Unit Size]. However, valign=8 is outside the set of valid values of
+       * RENDER_SURFACE_STATE.SurfaceVerticalAlignment, which is VALIGN_2
+       * (0x0) and VALIGN_4 (0x1).
+       *
+       * The PRM is generally confused about the width, height, and alignment
+       * of the stencil buffer; and this confusion appears elsewhere. For
+       * example, the following PRM text effectively converts the stencil
+       * buffer's 8-pixel alignment to a 4-pixel alignment [Ivybridge PRM,
+       * Volume 1, Part 1, Section
+       * 6.18.4.2 Base Address and LOD Calculation]:
+       *
+       *    For separate stencil buffer, the width must be mutiplied by 2 and
+       *    height divided by 2 as follows:
+       *
+       *       w_L = 2*i*ceil(W_L/i)
+       *       h_L = 1/2*j*ceil(H_L/j)
+       *
+       * The root of the confusion is that, in W tiling, each pair of rows is
+       * interleaved into one.
+       *
+       * FINISHME(chadv): Decide to set valign=4 or valign=8 after isl's API
+       * is more polished.
+       */
+      require_valign4 = true;
+   }
+
+   /* The two requirements are contradictory; they must not co-occur. */
+   assert(!require_valign2 || !require_valign4);
+
+   if (require_valign4)
+      return 4;
+
+   /* Prefer VALIGN_2 because it conserves memory. */
+   return 2;
+}
+
+/**
+ * Choose the subimage alignment, in units of surface elements, for a gen7
+ * surface, by combining the horizontal and vertical alignment helpers.
+ */
+void
+gen7_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el)
+{
+   /* Combined depth/stencil surfaces do not exist on IVB and later. */
+   assert(!isl_surf_usage_is_depth_and_stencil(info->usage));
+
+   const uint32_t halign = gen7_choose_halign_el(dev, info);
+   const uint32_t valign = gen7_choose_valign_el(dev, info, tiling);
+
+   *image_align_el = isl_extent3d(halign, valign, 1);
+}
diff --git a/src/isl/isl_gen7.h b/src/isl/isl_gen7.h
new file mode 100644 (file)
index 0000000..2a95b68
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "isl_priv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Removes tiling flags that are invalid for the surface described by info;
+ * defined in isl_gen7.c.
+ */
+void
+gen7_filter_tiling(const struct isl_device *dev,
+                   const struct isl_surf_init_info *restrict info,
+                   isl_tiling_flags_t *flags);
+
+/* Chooses the MSAA layout for a gen7 surface; defined in isl_gen7.c. */
+bool
+gen7_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout);
+
+/* Chooses the subimage alignment (in surface elements) for a gen7 surface;
+ * defined in isl_gen7.c.
+ */
+void
+gen7_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_gen8.c b/src/isl/isl_gen8.c
new file mode 100644 (file)
index 0000000..2f434aa
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl_gen8.h"
+#include "isl_priv.h"
+
+/**
+ * Choose the MSAA layout for a Broadwell (gen8) surface.
+ *
+ * On success, *msaa_layout is set and true is returned.  Returns false when
+ * the requested sample count cannot be supported for this configuration.
+ */
+bool
+gen8_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout)
+{
+   bool require_array = false;
+   bool require_interleaved = false;
+
+   assert(info->samples >= 1);
+
+   if (info->samples == 1) {
+      *msaa_layout = ISL_MSAA_LAYOUT_NONE;
+      return true;
+   }
+
+   /* From the Broadwell PRM >> Volume2d: Command Structures >>
+    * RENDER_SURFACE_STATE Tile Mode:
+    *
+    *    - If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
+    *      must be YMAJOR.
+    *
+    * As usual, though, stencil is special.
+    *
+    * NOTE(review): isl_tiling_is_std_y() covers only the Yf/Ys tilings, so
+    * this check also rejects legacy Y0-tiled non-stencil MSAA surfaces --
+    * confirm that is intended given the quoted YMAJOR requirement.
+    */
+   if (!isl_tiling_is_std_y(tiling) && !isl_surf_usage_is_stencil(info->usage))
+      return false;
+
+   /* From the Broadwell PRM >> Volume2d: Command Structures >>
+    * RENDER_SURFACE_STATE Multisampled Surface Storage Format:
+    *
+    *    All multisampled render target surfaces must have this field set to
+    *    MSFMT_MSS
+    */
+   if (info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)
+      require_array = true;
+
+   /* From the Broadwell PRM >> Volume2d: Command Structures >>
+    * RENDER_SURFACE_STATE Number of Multisamples:
+    *
+    *    - If this field is any value other than MULTISAMPLECOUNT_1, the
+    *      Surface Type must be SURFTYPE_2D This field must be set to
+    *      MULTISAMPLECOUNT_1 unless the surface is a Sampling Engine surface
+    *      or Render Target surface.
+    *
+    *    - If this field is any value other than MULTISAMPLECOUNT_1, Surface
+    *      Min LOD, Mip Count / LOD, and Resource Min LOD must be set to zero.
+    */
+   if (info->dim != ISL_SURF_DIM_2D)
+      return false;
+   if (info->levels > 1)
+      return false;
+
+   /* More obvious restrictions */
+   if (isl_surf_usage_is_display(info->usage))
+      return false;
+   if (isl_format_is_compressed(info->format))
+      return false;
+   if (isl_format_is_yuv(info->format))
+      return false;
+
+   /* Depth/stencil MSAA always uses the interleaved layout. */
+   if (isl_surf_usage_is_depth_or_stencil(info->usage))
+      require_interleaved = true;
+
+   /* Contradictory requirements: no valid layout exists. */
+   if (require_array && require_interleaved)
+      return false;
+
+   if (require_interleaved) {
+      *msaa_layout = ISL_MSAA_LAYOUT_INTERLEAVED;
+      return true;
+   }
+
+   *msaa_layout = ISL_MSAA_LAYOUT_ARRAY;
+   return true;
+}
+
+/**
+ * Choose horizontal subimage alignment, in units of surface elements.
+ *
+ * Returns 1 (one block) for compressed formats; otherwise a format- and
+ * usage-dependent value per the Broadwell PRM excerpts below.
+ */
+static uint32_t
+gen8_choose_halign_el(const struct isl_device *dev,
+                      const struct isl_surf_init_info *restrict info)
+{
+   /* One compression block, expressed in elements. */
+   if (isl_format_is_compressed(info->format))
+      return 1;
+
+   /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
+    * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
+    *
+    *    - This field is intended to be set to HALIGN_8 only if the surface
+    *      was rendered as a depth buffer with Z16 format or a stencil buffer.
+    *      In this case it must be set to HALIGN_8 since these surfaces
+    *      support only alignment of 8. [...]
+    */
+   if (isl_surf_info_is_z16(info))
+      return 8;
+   if (isl_surf_usage_is_stencil(info->usage))
+      return 8;
+
+   /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
+    * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
+    *
+    *      [...] For Z32 formats it must be set to HALIGN_4.
+    */
+   if (isl_surf_usage_is_depth(info->usage))
+      return 4;
+
+   if (!(info->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT)) {
+      /* From the Broadwell PRM, Volume 2d "Command Reference: Structures",
+       * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326:
+       *
+       *    - When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E,
+       *      HALIGN 16 must be used.
+       *
+       * This case handles color surfaces that may own an auxiliary MCS, CCS_D,
+       * or CCS_E. Depth buffers, including those that own an auxiliary HiZ
+       * surface, are handled above and do not require HALIGN_16.
+       */
+      assert(!isl_surf_usage_is_depth(info->usage));
+      return 16;
+   }
+
+   /* XXX(chadv): I believe the hardware requires each image to be
+    * cache-aligned. If that's true, then defaulting to halign=4 is wrong for
+    * many formats. Depending on the format's block size, we may need to
+    * increase halign to 8.
+    */
+   return 4;
+}
+
+/**
+ * Choose vertical subimage alignment, in units of surface elements.
+ */
+static uint32_t
+gen8_choose_valign_el(const struct isl_device *dev,
+                      const struct isl_surf_init_info *restrict info)
+{
+   /* From the Broadwell PRM > Volume 2d: Command Reference: Structures
+    * > RENDER_SURFACE_STATE Surface Vertical Alignment (p325):
+    *
+    *    - For Sampling Engine and Render Target Surfaces: This field
+    *      specifies the vertical alignment requirement in elements for the
+    *      surface. [...] An element is defined as a pixel in uncompressed
+    *      surface formats, and as a compression block in compressed surface
+    *      formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
+    *      element is a sample.
+    *
+    *    - This field is intended to be set to VALIGN_4 if the surface was
+    *      rendered as a depth buffer, for a multisampled (4x) render target,
+    *      or for a multisampled (8x) render target, since these surfaces
+    *      support only alignment of 4. Use of VALIGN_4 for other surfaces is
+    *      supported, but increases memory usage.
+    *
+    *    - This field is intended to be set to VALIGN_8 only if the surface
+    *       was rendered as a stencil buffer, since stencil buffer surfaces
+    *       support only alignment of 8. If set to VALIGN_8, Surface Format
+    *       must be R8_UINT.
+    */
+
+   if (isl_format_is_compressed(info->format))
+      return 1;
+
+   if (isl_surf_usage_is_stencil(info->usage))
+      return 8;
+
+   return 4;
+}
+
+void
+gen8_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el)
+{
+   assert(!isl_tiling_is_std_y(tiling));
+
+   /* The below text from the Broadwell PRM provides some insight into the
+    * hardware's requirements for LOD alignment.  From the Broadwell PRM >>
+    * Volume 5: Memory Views >> Surface Layout >> 2D Surfaces:
+    *
+    *    These [2D surfaces] must adhere to the following memory organization
+    *    rules:
+    *
+    *       - For non-compressed texture formats, each mipmap must start on an
+    *         even row within the monolithic rectangular area. For
+    *         1-texel-high mipmaps, this may require a row of padding below
+    *         the previous mipmap. This restriction does not apply to any
+    *         compressed texture formats; each subsequent (lower-res)
+    *         compressed mipmap is positioned directly below the previous
+    *         mipmap.
+    *
+    *       - Vertical alignment restrictions vary with memory tiling type:
+    *         1 DWord for linear, 16-byte (DQWord) for tiled. (Note that tiled
+    *         mipmaps are not required to start at the left edge of a tile
+    *         row.)
+    */
+
+   *image_align_el = (struct isl_extent3d) {
+      .w = gen8_choose_halign_el(dev, info),
+      .h = gen8_choose_valign_el(dev, info),
+      .d = 1,
+   };
+}
diff --git a/src/isl/isl_gen8.h b/src/isl/isl_gen8.h
new file mode 100644 (file)
index 0000000..2017ea8
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "isl_priv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool
+gen8_choose_msaa_layout(const struct isl_device *dev,
+                        const struct isl_surf_init_info *info,
+                        enum isl_tiling tiling,
+                        enum isl_msaa_layout *msaa_layout);
+
+void
+gen8_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_gen9.c b/src/isl/isl_gen9.c
new file mode 100644 (file)
index 0000000..aa290aa
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#include "isl_gen8.h"
+#include "isl_gen9.h"
+#include "isl_priv.h"
+
+/**
+ * Calculate the surface's subimage alignment, in units of surface samples,
+ * for the standard tiling formats Yf and Ys.
+ */
+static void
+gen9_calc_std_image_alignment_sa(const struct isl_device *dev,
+                                 const struct isl_surf_init_info *restrict info,
+                                 enum isl_tiling tiling,
+                                 enum isl_msaa_layout msaa_layout,
+                                 struct isl_extent3d *align_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
+
+   assert(isl_tiling_is_std_y(tiling));
+
+   const uint32_t bs = fmtl->bs;
+   const uint32_t is_Ys = tiling == ISL_TILING_Ys;
+
+   switch (info->dim) {
+   case ISL_SURF_DIM_1D:
+      /* See the Skylake BSpec > Memory Views > Common Surface Formats > Surface
+       * Layout and Tiling > 1D Surfaces > 1D Alignment Requirements.
+       */
+      *align_sa = (struct isl_extent3d) {
+         .w = 1 << (12 - (ffs(bs) - 1) + (4 * is_Ys)),
+         .h = 1,
+         .d = 1,
+      };
+      return;
+   case ISL_SURF_DIM_2D:
+      /* See the Skylake BSpec > Memory Views > Common Surface Formats >
+       * Surface Layout and Tiling > 2D Surfaces > 2D/CUBE Alignment
+       * Requirements.
+       */
+      *align_sa = (struct isl_extent3d) {
+         .w = 1 << (6 - ((ffs(bs) - 1) / 2) + (4 * is_Ys)),
+         .h = 1 << (6 - ((ffs(bs) - 0) / 2) + (4 * is_Ys)),
+         .d = 1,
+      };
+
+      if (is_Ys) {
+         /* FINISHME(chadv): I don't trust this code. Untested. */
+         isl_finishme("%s:%s: [SKL+] multisample TileYs", __FILE__, __func__);
+
+         switch (msaa_layout) {
+         case ISL_MSAA_LAYOUT_NONE:
+         case ISL_MSAA_LAYOUT_INTERLEAVED:
+            break;
+         case ISL_MSAA_LAYOUT_ARRAY:
+            align_sa->w >>= (ffs(info->samples) - 0) / 2;
+            align_sa->h >>= (ffs(info->samples) - 1) / 2;
+            break;
+         }
+      }
+      return;
+
+   case ISL_SURF_DIM_3D:
+      /* See the Skylake BSpec > Memory Views > Common Surface Formats > Surface
+       * Layout and Tiling > 3D Surfaces > 3D Alignment Requirements.
+       */
+      *align_sa = (struct isl_extent3d) {
+         .w = 1 << (4 - ((ffs(bs) + 1) / 3) + (4 * is_Ys)),
+         .h = 1 << (4 - ((ffs(bs) - 1) / 3) + (2 * is_Ys)),
+         .d = 1 << (4 - ((ffs(bs) - 0) / 3) + (2 * is_Ys)),
+      };
+      return;
+   }
+
+   unreachable("bad isl_surface_type");
+}
+
+void
+gen9_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el)
+{
+   /* This BSpec text provides some insight into the hardware's alignment
+    * requirements [Skylake BSpec > Memory Views > Common Surface Formats >
+    * Surface Layout and Tiling > 2D Surfaces]:
+    *
+    *    An LOD must be aligned to a cache-line except for some special cases
+    *    related to Planar YUV surfaces.  In general, the cache-alignment
+    *    restriction implies there is a minimum height for an LOD of 4 texels.
+    *    So, LODs which are smaller than 4 high are padded.
+    *
+    * From the Skylake BSpec, RENDER_SURFACE_STATE Surface Vertical Alignment:
+    *
+    *    - For Sampling Engine and Render Target Surfaces: This field
+    *      specifies the vertical alignment requirement in elements for the
+    *      surface. [...] An element is defined as a pixel in uncompressed
+    *      surface formats, and as a compression block in compressed surface
+    *      formats. For MSFMT_DEPTH_STENCIL type multisampled surfaces, an
+    *      element is a sample.
+    *
+    *    - This field is used for 2D, CUBE, and 3D surface alignment when Tiled
+    *      Resource Mode is TRMODE_NONE (Tiled Resource Mode is disabled).
+    *      This field is ignored for 1D surfaces and also when Tiled Resource
+    *      Mode is not TRMODE_NONE (e.g. Tiled Resource Mode is enabled).
+    *
+    *      See the appropriate Alignment table in the "Surface Layout and
+    *      Tiling" section under Common Surface Formats for the table of
+    *      alignment values for Tiled Resources.
+    *
+    *    - For uncompressed surfaces, the units of "j" are rows of pixels on
+    *      the physical surface. For compressed texture formats, the units of
+    *      "j" are in compression blocks, thus each increment in "j" is equal
+    *      to h pixels, where h is the height of the compression block in
+    *      pixels.
+    *
+    *    - Valid Values: VALIGN_4, VALIGN_8, VALIGN_16
+    *
+    * From the Skylake BSpec, RENDER_SURFACE_STATE Surface Horizontal
+    * Alignment:
+    *
+    *    -  For uncompressed surfaces, the units of "i" are pixels on the
+    *       physical surface. For compressed texture formats, the units of "i"
+    *       are in compression blocks, thus each increment in "i" is equal to
+    *       w pixels, where w is the width of the compression block in pixels.
+    *
+    *    - Valid Values: HALIGN_4, HALIGN_8, HALIGN_16
+    */
+
+   if (isl_tiling_is_std_y(tiling)) {
+      struct isl_extent3d image_align_sa;
+      gen9_calc_std_image_alignment_sa(dev, info, tiling, msaa_layout,
+                                     &image_align_sa);
+
+      *image_align_el = isl_extent3d_sa_to_el(info->format, image_align_sa);
+      return;
+   }
+
+   if (info->dim == ISL_SURF_DIM_1D) {
+      /* See the Skylake BSpec > Memory Views > Common Surface Formats > Surface
+       * Layout and Tiling > 1D Surfaces > 1D Alignment Requirements.
+       */
+      *image_align_el = isl_extent3d(64, 1, 1);
+      return;
+   }
+
+   if (isl_format_is_compressed(info->format)) {
+      /* On Gen9, the meaning of RENDER_SURFACE_STATE's
+       * SurfaceHorizontalAlignment and SurfaceVerticalAlignment changed for
+       * compressed formats. They now indicate a multiple of the compression
+       * block.  For example, if the compression mode is ETC2 then HALIGN_4
+       * indicates a horizontal alignment of 16 pixels.
+       *
+       * To avoid wasting memory, choose the smallest alignment possible:
+       * HALIGN_4 and VALIGN_4.
+       */
+      *image_align_el = isl_extent3d(4, 4, 1);
+      return;
+   }
+
+   gen8_choose_image_alignment_el(dev, info, tiling, msaa_layout,
+                                  image_align_el);
+}
diff --git a/src/isl/isl_gen9.h b/src/isl/isl_gen9.h
new file mode 100644 (file)
index 0000000..64ed0aa
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "isl_priv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+gen9_choose_image_alignment_el(const struct isl_device *dev,
+                               const struct isl_surf_init_info *restrict info,
+                               enum isl_tiling tiling,
+                               enum isl_msaa_layout msaa_layout,
+                               struct isl_extent3d *image_align_el);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/isl/isl_image.c b/src/isl/isl_image.c
new file mode 100644 (file)
index 0000000..7731604
--- /dev/null
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl.h"
+#include "brw_compiler.h"
+
+bool
+isl_is_storage_image_format(enum isl_format format)
+{
+   /* XXX: Maybe we should put this in the CSV? */
+
+   switch (format) {
+   case ISL_FORMAT_R32G32B32A32_UINT:
+   case ISL_FORMAT_R32G32B32A32_SINT:
+   case ISL_FORMAT_R32G32B32A32_FLOAT:
+   case ISL_FORMAT_R32_UINT:
+   case ISL_FORMAT_R32_SINT:
+   case ISL_FORMAT_R32_FLOAT:
+   case ISL_FORMAT_R16G16B16A16_UINT:
+   case ISL_FORMAT_R16G16B16A16_SINT:
+   case ISL_FORMAT_R16G16B16A16_FLOAT:
+   case ISL_FORMAT_R32G32_UINT:
+   case ISL_FORMAT_R32G32_SINT:
+   case ISL_FORMAT_R32G32_FLOAT:
+   case ISL_FORMAT_R8G8B8A8_UINT:
+   case ISL_FORMAT_R8G8B8A8_SINT:
+   case ISL_FORMAT_R16G16_UINT:
+   case ISL_FORMAT_R16G16_SINT:
+   case ISL_FORMAT_R16G16_FLOAT:
+   case ISL_FORMAT_R8G8_UINT:
+   case ISL_FORMAT_R8G8_SINT:
+   case ISL_FORMAT_R16_UINT:
+   case ISL_FORMAT_R16_FLOAT:
+   case ISL_FORMAT_R16_SINT:
+   case ISL_FORMAT_R8_UINT:
+   case ISL_FORMAT_R8_SINT:
+   case ISL_FORMAT_R10G10B10A2_UINT:
+   case ISL_FORMAT_R10G10B10A2_UNORM:
+   case ISL_FORMAT_R11G11B10_FLOAT:
+   case ISL_FORMAT_R16G16B16A16_UNORM:
+   case ISL_FORMAT_R16G16B16A16_SNORM:
+   case ISL_FORMAT_R8G8B8A8_UNORM:
+   case ISL_FORMAT_R8G8B8A8_SNORM:
+   case ISL_FORMAT_R16G16_UNORM:
+   case ISL_FORMAT_R16G16_SNORM:
+   case ISL_FORMAT_R8G8_UNORM:
+   case ISL_FORMAT_R8G8_SNORM:
+   case ISL_FORMAT_R16_UNORM:
+   case ISL_FORMAT_R16_SNORM:
+   case ISL_FORMAT_R8_UNORM:
+   case ISL_FORMAT_R8_SNORM:
+      return true;
+   default:
+      return false;
+   }
+}
+
+enum isl_format
+isl_lower_storage_image_format(const struct isl_device *dev,
+                               enum isl_format format)
+{
+   switch (format) {
+   /* These are never lowered.  Up to BDW we'll have to fall back to untyped
+    * surface access for 128bpp formats.
+    */
+   case ISL_FORMAT_R32G32B32A32_UINT:
+   case ISL_FORMAT_R32G32B32A32_SINT:
+   case ISL_FORMAT_R32G32B32A32_FLOAT:
+   case ISL_FORMAT_R32_UINT:
+   case ISL_FORMAT_R32_SINT:
+   case ISL_FORMAT_R32_FLOAT:
+      return format;
+
+   /* From HSW to BDW the only 64bpp format supported for typed access is
+    * RGBA_UINT16.  IVB falls back to untyped.
+    */
+   case ISL_FORMAT_R16G16B16A16_UINT:
+   case ISL_FORMAT_R16G16B16A16_SINT:
+   case ISL_FORMAT_R16G16B16A16_FLOAT:
+   case ISL_FORMAT_R32G32_UINT:
+   case ISL_FORMAT_R32G32_SINT:
+   case ISL_FORMAT_R32G32_FLOAT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format :
+              ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R16G16B16A16_UINT :
+              ISL_FORMAT_R32G32_UINT);
+
+   /* Up to BDW no SINT or FLOAT formats of less than 32 bits per component
+    * are supported.  IVB doesn't support formats with more than one component
+    * for typed access.  For 8 and 16 bpp formats IVB relies on the
+    * undocumented behavior that typed reads from R_UINT8 and R_UINT16
+    * surfaces actually do a 32-bit misaligned read.  The alternative would be
+    * to use two surface state entries with different formats for each image,
+    * one for reading (using R_UINT32) and another one for writing (using
+    * R_UINT8 or R_UINT16), but that would complicate the shaders we generate
+    * even more.
+    */
+   case ISL_FORMAT_R8G8B8A8_UINT:
+   case ISL_FORMAT_R8G8B8A8_SINT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format :
+              ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R8G8B8A8_UINT : ISL_FORMAT_R32_UINT);
+
+   case ISL_FORMAT_R16G16_UINT:
+   case ISL_FORMAT_R16G16_SINT:
+   case ISL_FORMAT_R16G16_FLOAT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format :
+              ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R16G16_UINT : ISL_FORMAT_R32_UINT);
+
+   case ISL_FORMAT_R8G8_UINT:
+   case ISL_FORMAT_R8G8_SINT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format :
+              ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R8G8_UINT : ISL_FORMAT_R16_UINT);
+
+   case ISL_FORMAT_R16_UINT:
+   case ISL_FORMAT_R16_FLOAT:
+   case ISL_FORMAT_R16_SINT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format : ISL_FORMAT_R16_UINT);
+
+   case ISL_FORMAT_R8_UINT:
+   case ISL_FORMAT_R8_SINT:
+      return (ISL_DEV_GEN(dev) >= 9 ? format : ISL_FORMAT_R8_UINT);
+
+   /* Neither the 2/10/10/10 nor the 11/11/10 packed formats are supported
+    * by the hardware.
+    */
+   case ISL_FORMAT_R10G10B10A2_UINT:
+   case ISL_FORMAT_R10G10B10A2_UNORM:
+   case ISL_FORMAT_R11G11B10_FLOAT:
+      return ISL_FORMAT_R32_UINT;
+
+   /* No normalized fixed-point formats are supported by the hardware. */
+   case ISL_FORMAT_R16G16B16A16_UNORM:
+   case ISL_FORMAT_R16G16B16A16_SNORM:
+      return (ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R16G16B16A16_UINT :
+              ISL_FORMAT_R32G32_UINT);
+
+   case ISL_FORMAT_R8G8B8A8_UNORM:
+   case ISL_FORMAT_R8G8B8A8_SNORM:
+      return (ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R8G8B8A8_UINT : ISL_FORMAT_R32_UINT);
+
+   case ISL_FORMAT_R16G16_UNORM:
+   case ISL_FORMAT_R16G16_SNORM:
+      return (ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R16G16_UINT : ISL_FORMAT_R32_UINT);
+
+   case ISL_FORMAT_R8G8_UNORM:
+   case ISL_FORMAT_R8G8_SNORM:
+      return (ISL_DEV_GEN(dev) >= 8 || dev->info->is_haswell ?
+              ISL_FORMAT_R8G8_UINT : ISL_FORMAT_R16_UINT);
+
+   case ISL_FORMAT_R16_UNORM:
+   case ISL_FORMAT_R16_SNORM:
+      return ISL_FORMAT_R16_UINT;
+
+   case ISL_FORMAT_R8_UNORM:
+   case ISL_FORMAT_R8_SNORM:
+      return ISL_FORMAT_R8_UINT;
+
+   default:
+      assert(!"Unknown image format");
+      return ISL_FORMAT_UNSUPPORTED;
+   }
+}
diff --git a/src/isl/isl_priv.h b/src/isl/isl_priv.h
new file mode 100644 (file)
index 0000000..b399e0f
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a
+ *  copy of this software and associated documentation files (the "Software"),
+ *  to deal in the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice (including the next
+ *  paragraph) shall be included in all copies or substantial portions of the
+ *  Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <assert.h>
+
+#include "brw_device_info.h"
+#include "mesa/main/imports.h"
+#include "util/macros.h"
+
+#include "isl.h"
+
+#define isl_finishme(format, ...) \
+   __isl_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__)
+
+void PRINTFLIKE(3, 4) UNUSED
+__isl_finishme(const char *file, int line, const char *fmt, ...);
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+static inline uint32_t
+ffs(uint32_t n) {
+   return __builtin_ffs(n);
+}
+
+static inline bool
+isl_is_pow2(uintmax_t n)
+{
+   return !(n & (n - 1));
+}
+
+/**
+ * Alignment must be a power of 2.
+ */
+static inline bool
+isl_is_aligned(uintmax_t n, uintmax_t a)
+{
+   assert(isl_is_pow2(a));
+   return (n & (a - 1)) == 0;
+}
+
+/**
+ * Alignment must be a power of 2.
+ */
+static inline uintmax_t
+isl_align(uintmax_t n, uintmax_t a)
+{
+   assert(a != 0 && isl_is_pow2(a));
+   return (n + a - 1) & ~(a - 1);
+}
+
+static inline uintmax_t
+isl_align_npot(uintmax_t n, uintmax_t a)
+{
+   assert(a > 0);
+   return ((n + a - 1) / a) * a;
+}
+
+/**
+ * Alignment must be a power of 2.
+ */
+static inline uintmax_t
+isl_align_div(uintmax_t n, uintmax_t a)
+{
+   return isl_align(n, a) / a;
+}
+
+static inline uintmax_t
+isl_align_div_npot(uintmax_t n, uintmax_t a)
+{
+   return isl_align_npot(n, a) / a;
+}
+
+/**
+ * Log base 2, rounding towards zero.
+ */
+static inline uint32_t
+isl_log2u(uint32_t n)
+{
+   assert(n != 0);
+   return 31 - __builtin_clz(n);
+}
+
+static inline uint32_t
+isl_minify(uint32_t n, uint32_t levels)
+{
+   if (unlikely(n == 0))
+      return 0;
+   else
+      return MAX(n >> levels, 1);
+}
+
+static inline struct isl_extent3d
+isl_extent3d_sa_to_el(enum isl_format fmt, struct isl_extent3d extent_sa)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   assert(extent_sa.w % fmtl->bw == 0);
+   assert(extent_sa.h % fmtl->bh == 0);
+   assert(extent_sa.d % fmtl->bd == 0);
+
+   return (struct isl_extent3d) {
+      .w = extent_sa.w / fmtl->bw,
+      .h = extent_sa.h / fmtl->bh,
+      .d = extent_sa.d / fmtl->bd,
+   };
+}
+
+static inline struct isl_extent3d
+isl_extent3d_el_to_sa(enum isl_format fmt, struct isl_extent3d extent_el)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return (struct isl_extent3d) {
+      .w = extent_el.w * fmtl->bw,
+      .h = extent_el.h * fmtl->bh,
+      .d = extent_el.d * fmtl->bd,
+   };
+}
diff --git a/src/isl/tests/.gitignore b/src/isl/tests/.gitignore
new file mode 100644 (file)
index 0000000..ba70ecf
--- /dev/null
@@ -0,0 +1 @@
+/isl_surf_get_image_offset_test
diff --git a/src/isl/tests/isl_surf_get_image_offset_test.c b/src/isl/tests/isl_surf_get_image_offset_test.c
new file mode 100644 (file)
index 0000000..78362be
--- /dev/null
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "brw_device_info.h"
+#include "isl.h"
+
+#define BDW_GT2_DEVID 0x161a
+
+// An assert that works regardless of NDEBUG.
+#define t_assert(cond) \
+   do { \
+      if (!(cond)) { \
+         fprintf(stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
+         abort(); \
+      } \
+   } while (0)
+
+static void
+t_assert_extent4d(const struct isl_extent4d *e, uint32_t width,
+                  uint32_t height, uint32_t depth, uint32_t array_len)
+{
+   t_assert(e->width == width);
+   t_assert(e->height == height);
+   t_assert(e->depth == depth);
+   t_assert(e->array_len == array_len);
+}
+
+static void
+t_assert_image_alignment_el(const struct isl_surf *surf,
+                            uint32_t w, uint32_t h, uint32_t d)
+{
+   struct isl_extent3d align_el;
+
+   align_el = isl_surf_get_image_alignment_el(surf);
+   t_assert(align_el.w == w);
+   t_assert(align_el.h == h);
+   t_assert(align_el.d == d);
+
+}
+
+static void
+t_assert_image_alignment_sa(const struct isl_surf *surf,
+                            uint32_t w, uint32_t h, uint32_t d)
+{
+   struct isl_extent3d align_sa;
+
+   align_sa = isl_surf_get_image_alignment_sa(surf);
+   t_assert(align_sa.w == w);
+   t_assert(align_sa.h == h);
+   t_assert(align_sa.d == d);
+
+}
+
+static void
+t_assert_offset(const struct isl_surf *surf,
+                uint32_t level,
+                uint32_t logical_array_layer,
+                uint32_t logical_z_offset_px,
+                uint32_t expected_x_offset_sa,
+                uint32_t expected_y_offset_sa)
+{
+   uint32_t x, y;
+   isl_surf_get_image_offset_sa(surf, level, logical_array_layer,
+                                logical_z_offset_px, &x, &y);
+
+   t_assert(x == expected_x_offset_sa);
+   t_assert(y == expected_y_offset_sa);
+}
+
+static void
+t_assert_phys_level0_sa(const struct isl_surf *surf, uint32_t width,
+                        uint32_t height, uint32_t depth, uint32_t array_len)
+{
+   t_assert_extent4d(&surf->phys_level0_sa, width, height, depth, array_len);
+}
+
+static void
+test_bdw_2d_r8g8b8a8_unorm_512x512_a1_s1_noaux_y0(void)
+{
+   bool ok;
+
+   struct isl_device dev;
+   isl_device_init(&dev, brw_get_device_info(BDW_GT2_DEVID));
+
+   struct isl_surf surf;
+   ok = isl_surf_init(&dev, &surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = ISL_FORMAT_R8G8B8A8_UNORM,
+                      .width = 512,
+                      .height = 512,
+                      .depth = 1,
+                      .levels = 10,
+                      .array_len = 1,
+                      .samples = 1,
+                      .usage = ISL_SURF_USAGE_TEXTURE_BIT |
+                               ISL_SURF_USAGE_DISABLE_AUX_BIT,
+                      .tiling_flags = ISL_TILING_Y0_BIT);
+   t_assert(ok);
+
+   t_assert_image_alignment_el(&surf, 4, 4, 1);
+   t_assert_image_alignment_sa(&surf, 4, 4, 1);
+   t_assert_phys_level0_sa(&surf, 512, 512, 1, 1);
+   t_assert(isl_surf_get_array_pitch_el_rows(&surf) >= 772);
+   t_assert(isl_surf_get_array_pitch_sa_rows(&surf) >= 772);
+
+   t_assert_offset(&surf, 0, 0, 0, 0, 0); // +0, +0
+   t_assert_offset(&surf, 1, 0, 0, 0, 512); // +0, +512
+   t_assert_offset(&surf, 2, 0, 0, 256, 512); // +256, +0
+   t_assert_offset(&surf, 3, 0, 0, 256, 640); // +0, +128
+   t_assert_offset(&surf, 4, 0, 0, 256, 704); // +0, +64
+   t_assert_offset(&surf, 5, 0, 0, 256, 736); // +0, +32
+   t_assert_offset(&surf, 6, 0, 0, 256, 752); // +0, +16
+   t_assert_offset(&surf, 7, 0, 0, 256, 760); // +0, +8
+   t_assert_offset(&surf, 8, 0, 0, 256, 764); // +0, +4
+   t_assert_offset(&surf, 9, 0, 0, 256, 768); // +0, +4
+}
+
+static void
+test_bdw_2d_r8g8b8a8_unorm_1024x1024_a6_s1_noaux_y0(void)
+{
+   bool ok;
+
+   struct isl_device dev;
+   isl_device_init(&dev, brw_get_device_info(BDW_GT2_DEVID));
+
+   struct isl_surf surf;
+   ok = isl_surf_init(&dev, &surf,
+                      .dim = ISL_SURF_DIM_2D,
+                      .format = ISL_FORMAT_R8G8B8A8_UNORM,
+                      .width = 1024,
+                      .height = 1024,
+                      .depth = 1,
+                      .levels = 11,
+                      .array_len = 6,
+                      .samples = 1,
+                      .usage = ISL_SURF_USAGE_TEXTURE_BIT |
+                               ISL_SURF_USAGE_DISABLE_AUX_BIT,
+                      .tiling_flags = ISL_TILING_Y0_BIT);
+   t_assert(ok);
+
+   t_assert_image_alignment_el(&surf, 4, 4, 1);
+   t_assert_image_alignment_sa(&surf, 4, 4, 1);
+   t_assert_image_alignment_sa(&surf, 4, 4, 1);
+   t_assert(isl_surf_get_array_pitch_el_rows(&surf) >= 1540);
+   t_assert(isl_surf_get_array_pitch_sa_rows(&surf) >= 1540);
+
+   for (uint32_t a = 0; a < 6; ++a) {
+      uint32_t b = a * isl_surf_get_array_pitch_sa_rows(&surf);
+
+      t_assert_offset(&surf, 0, a, 0, 0, b + 0); // +0, +0
+      t_assert_offset(&surf, 1, a, 0, 0, b + 1024); // +0, +1024
+      t_assert_offset(&surf, 2, a, 0, 512, b + 1024); // +512, +0
+      t_assert_offset(&surf, 3, a, 0, 512, b + 1280); // +0, +256
+      t_assert_offset(&surf, 4, a, 0, 512, b + 1408); // +0, +128
+      t_assert_offset(&surf, 5, a, 0, 512, b + 1472); // +0, +64
+      t_assert_offset(&surf, 6, a, 0, 512, b + 1504); // +0, +32
+      t_assert_offset(&surf, 7, a, 0, 512, b + 1520); // +0, +16
+      t_assert_offset(&surf, 8, a, 0, 512, b + 1528); // +0, +8
+      t_assert_offset(&surf, 9, a, 0, 512, b + 1532); // +0, +4
+      t_assert_offset(&surf, 10, a, 0, 512, b + 1536); // +0, +4
+   }
+}
+
+int main(void)
+{
+   /* FINISHME: Add tests for npot sizes */
+   /* FINISHME: Add tests for 1D surfaces */
+   /* FINISHME: Add tests for 3D surfaces */
+
+   test_bdw_2d_r8g8b8a8_unorm_512x512_a1_s1_noaux_y0();
+   test_bdw_2d_r8g8b8a8_unorm_1024x1024_a6_s1_noaux_y0();
+}
diff --git a/src/mapi/glapi/gen/ARB_indirect_parameters.xml b/src/mapi/glapi/gen/ARB_indirect_parameters.xml
new file mode 100644 (file)
index 0000000..20de905
--- /dev/null
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_ARB_indirect_parameters" number="154">
+
+    <enum name="PARAMETER_BUFFER_ARB"                   value="0x80EE"/>
+    <enum name="PARAMETER_BUFFER_BINDING_ARB"           value="0x80EF"/>
+
+    <function name="MultiDrawArraysIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+    <function name="MultiDrawElementsIndirectCountARB" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="type" type="GLenum"/>
+        <param name="indirect" type="GLintptr"/>
+        <param name="drawcount" type="GLintptr"/>
+        <param name="maxdrawcount" type="GLsizei"/>
+        <param name="stride" type="GLsizei"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
index 2da8f7d..900b61a 100644 (file)
@@ -137,6 +137,7 @@ API_XML = \
        ARB_get_texture_sub_image.xml \
        ARB_gpu_shader_fp64.xml \
        ARB_gpu_shader5.xml \
+       ARB_indirect_parameters.xml \
        ARB_instanced_arrays.xml \
        ARB_internalformat_query.xml \
        ARB_invalidate_subdata.xml \
index 21f6293..593ace4 100644 (file)
 
 <xi:include href="ARB_multi_bind.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions 148 - 159 -->
+<!-- ARB extensions 148 - 153 -->
+
+<xi:include href="ARB_indirect_parameters.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions 155 - 159 -->
 
 <xi:include href="ARB_clip_control.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
index 36bed77..1ed0e4d 100644 (file)
@@ -2986,8 +2986,7 @@ meta_decompress_cleanup(struct gl_context *ctx,
       _mesa_reference_buffer_object(ctx, &decompress->buf_obj, NULL);
    }
 
-   if (decompress->Sampler != 0)
-      _mesa_DeleteSamplers(1, &decompress->Sampler);
+   _mesa_reference_sampler_object(ctx, &decompress->samp_obj, NULL);
 
    memset(decompress, 0, sizeof(*decompress));
 }
@@ -3017,7 +3016,7 @@ decompress_texture_image(struct gl_context *ctx,
    GLenum rbFormat;
    GLenum faceTarget;
    struct vertex verts[4];
-   GLuint samplerSave;
+   struct gl_sampler_object *samp_obj_save = NULL;
    GLenum status;
    const bool use_glsl_version = ctx->Extensions.ARB_vertex_shader &&
                                       ctx->Extensions.ARB_fragment_shader;
@@ -3067,8 +3066,8 @@ decompress_texture_image(struct gl_context *ctx,
    _mesa_meta_begin(ctx, MESA_META_ALL & ~(MESA_META_PIXEL_STORE |
                                            MESA_META_DRAW_BUFFERS));
 
-   samplerSave = ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler ?
-         ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler->Name : 0;
+   _mesa_reference_sampler_object(ctx, &samp_obj_save,
+                                  ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler);
 
    /* Create/bind FBO/renderbuffer */
    if (decompress_fbo->FBO == 0) {
@@ -3113,22 +3112,32 @@ decompress_texture_image(struct gl_context *ctx,
                                        &decompress->buf_obj, 3);
    }
 
-   if (!decompress->Sampler) {
-      _mesa_GenSamplers(1, &decompress->Sampler);
-      _mesa_BindSampler(ctx->Texture.CurrentUnit, decompress->Sampler);
-      /* nearest filtering */
-      _mesa_SamplerParameteri(decompress->Sampler, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-      _mesa_SamplerParameteri(decompress->Sampler, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-      /* No sRGB decode or encode.*/
-      if (ctx->Extensions.EXT_texture_sRGB_decode) {
-         _mesa_SamplerParameteri(decompress->Sampler, GL_TEXTURE_SRGB_DECODE_EXT,
-                             GL_SKIP_DECODE_EXT);
+   if (decompress->samp_obj == NULL) {
+      decompress->samp_obj =  ctx->Driver.NewSamplerObject(ctx, 0xDEADBEEF);
+      if (decompress->samp_obj == NULL) {
+         _mesa_meta_end(ctx);
+
+         /* This is a bit lazy.  Flag out of memory, and then don't bother to
+          * clean up.  Once out of memory is flagged, the only realistic next
+          * move is to destroy the context.  That will trigger all the right
+          * clean up.
+          *
+          * Returning true prevents other GetTexImage methods from attempting
+          * anything since they will likely fail too.
+          */
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
+         return true;
       }
 
-   } else {
-      _mesa_BindSampler(ctx->Texture.CurrentUnit, decompress->Sampler);
+      /* nearest filtering */
+      _mesa_set_sampler_filters(ctx, decompress->samp_obj, GL_NEAREST, GL_NEAREST);
+
+      /* We don't want to encode or decode sRGB values; treat them as linear. */
+      _mesa_set_sampler_srgb_decode(ctx, decompress->samp_obj, GL_SKIP_DECODE_EXT);
    }
 
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, decompress->samp_obj);
+
    /* Silence valgrind warnings about reading uninitialized stack. */
    memset(verts, 0, sizeof(verts));
 
@@ -3218,7 +3227,8 @@ decompress_texture_image(struct gl_context *ctx,
    if (!use_glsl_version)
       _mesa_set_enable(ctx, target, GL_FALSE);
 
-   _mesa_BindSampler(ctx->Texture.CurrentUnit, samplerSave);
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, samp_obj_save);
+   _mesa_reference_sampler_object(ctx, &samp_obj_save, NULL);
 
    _mesa_meta_end(ctx);
 
index 5b04755..3691e7d 100644 (file)
@@ -309,7 +309,9 @@ struct blit_state
 struct fb_tex_blit_state
 {
    GLint baseLevelSave, maxLevelSave;
-   GLuint sampler, samplerSave, stencilSamplingSave;
+   struct gl_sampler_object *samp_obj;
+   struct gl_sampler_object *samp_obj_save;
+   GLuint stencilSamplingSave;
    GLuint tempTex;
 };
 
@@ -367,7 +369,7 @@ struct gen_mipmap_state
    GLuint VAO;
    struct gl_buffer_object *buf_obj;
    GLuint FBO;
-   GLuint Sampler;
+   struct gl_sampler_object *samp_obj;
 
    struct blit_shader_table shaders;
 };
@@ -390,7 +392,7 @@ struct decompress_state
    GLuint VAO;
    struct decompress_fbo_state byteFBO, floatFBO;
    struct gl_buffer_object *buf_obj;
-   GLuint Sampler;
+   struct gl_sampler_object *samp_obj;
 
    struct blit_shader_table shaders;
 };
@@ -451,7 +453,7 @@ _mesa_meta_in_progress(struct gl_context *ctx)
 }
 
 extern void
-_mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
+_mesa_meta_fb_tex_blit_begin(struct gl_context *ctx,
                              struct fb_tex_blit_state *blit);
 
 extern void
@@ -465,7 +467,7 @@ _mesa_meta_bind_rb_as_tex_image(struct gl_context *ctx,
                                 struct gl_texture_object **texObj,
                                 GLenum *target);
 
-GLuint
+struct gl_sampler_object *
 _mesa_meta_setup_sampler(struct gl_context *ctx,
                          const struct gl_texture_object *texObj,
                          GLenum target, GLenum filter, GLuint srcLevel);
index 4dbf0a7..b414dce 100644 (file)
@@ -703,8 +703,8 @@ blitframebuffer_texture(struct gl_context *ctx,
      printf("  srcTex %p  dstText %p\n", texObj, drawAtt->Texture);
    */
 
-   fb_tex_blit.sampler = _mesa_meta_setup_sampler(ctx, texObj, target, filter,
-                                                  srcLevel);
+   fb_tex_blit.samp_obj = _mesa_meta_setup_sampler(ctx, texObj, target, filter,
+                                                   srcLevel);
 
    /* Always do our blits with no net sRGB decode or encode.
     *
@@ -725,13 +725,12 @@ blitframebuffer_texture(struct gl_context *ctx,
    if (ctx->Extensions.EXT_texture_sRGB_decode) {
       if (_mesa_get_format_color_encoding(rb->Format) == GL_SRGB &&
           drawFb->Visual.sRGBCapable) {
-         _mesa_SamplerParameteri(fb_tex_blit.sampler,
-                                 GL_TEXTURE_SRGB_DECODE_EXT, GL_DECODE_EXT);
+         _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                       GL_DECODE_EXT);
          _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
       } else {
-         _mesa_SamplerParameteri(fb_tex_blit.sampler,
-                                 GL_TEXTURE_SRGB_DECODE_EXT,
-                                 GL_SKIP_DECODE_EXT);
+         _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                       GL_SKIP_DECODE_EXT);
          /* set_framebuffer_srgb was set by _mesa_meta_begin(). */
       }
    }
@@ -808,12 +807,20 @@ blitframebuffer_texture(struct gl_context *ctx,
 }
 
 void
-_mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
+_mesa_meta_fb_tex_blit_begin(struct gl_context *ctx,
                              struct fb_tex_blit_state *blit)
 {
-   blit->samplerSave =
-      ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler ?
-      ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler->Name : 0;
+   /* None of the existing callers preinitialize fb_tex_blit_state to zeros,
+    * and both use stack variables.  If samp_obj_save is not NULL,
+    * _mesa_reference_sampler_object will try to dereference it.  Leaving
+    * random garbage in samp_obj_save can only lead to crashes.
+    *
+    * Since the state isn't persistent across calls, we won't catch ref
+    * counting problems.
+    */
+   blit->samp_obj_save = NULL;
+   _mesa_reference_sampler_object(ctx, &blit->samp_obj_save,
+                                  ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler);
    blit->tempTex = 0;
 }
 
@@ -839,8 +846,10 @@ _mesa_meta_fb_tex_blit_end(struct gl_context *ctx, GLenum target,
       }
    }
 
-   _mesa_BindSampler(ctx->Texture.CurrentUnit, blit->samplerSave);
-   _mesa_DeleteSamplers(1, &blit->sampler);
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, blit->samp_obj_save);
+   _mesa_reference_sampler_object(ctx, &blit->samp_obj_save, NULL);
+   _mesa_reference_sampler_object(ctx, &blit->samp_obj, NULL);
+
    if (blit->tempTex)
       _mesa_DeleteTextures(1, &blit->tempTex);
 }
@@ -884,31 +893,33 @@ _mesa_meta_bind_rb_as_tex_image(struct gl_context *ctx,
    return true;
 }
 
-GLuint
+struct gl_sampler_object *
 _mesa_meta_setup_sampler(struct gl_context *ctx,
                          const struct gl_texture_object *texObj,
                          GLenum target, GLenum filter, GLuint srcLevel)
 {
-   GLuint sampler;
+   struct gl_sampler_object *samp_obj;
    GLenum tex_filter = (filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
                         filter == GL_SCALED_RESOLVE_NICEST_EXT) ?
                        GL_NEAREST : filter;
 
-   _mesa_GenSamplers(1, &sampler);
-   _mesa_BindSampler(ctx->Texture.CurrentUnit, sampler);
+   samp_obj =  ctx->Driver.NewSamplerObject(ctx, 0xDEADBEEF);
+   if (samp_obj == NULL)
+      return NULL;
+
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, samp_obj);
+   _mesa_set_sampler_filters(ctx, samp_obj, tex_filter, tex_filter);
+   _mesa_set_sampler_wrap(ctx, samp_obj, GL_CLAMP_TO_EDGE, GL_CLAMP_TO_EDGE,
+                          samp_obj->WrapR);
 
    /* Prepare src texture state */
    _mesa_BindTexture(target, texObj->Name);
-   _mesa_SamplerParameteri(sampler, GL_TEXTURE_MIN_FILTER, tex_filter);
-   _mesa_SamplerParameteri(sampler, GL_TEXTURE_MAG_FILTER, tex_filter);
    if (target != GL_TEXTURE_RECTANGLE_ARB) {
       _mesa_TexParameteri(target, GL_TEXTURE_BASE_LEVEL, srcLevel);
       _mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, srcLevel);
    }
-   _mesa_SamplerParameteri(sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-   _mesa_SamplerParameteri(sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
-   return sampler;
+   return samp_obj;
 }
 
 /**
index 2b942d6..f20fcac 100644 (file)
@@ -137,8 +137,7 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gl_context *ctx,
    _mesa_DeleteVertexArrays(1, &mipmap->VAO);
    mipmap->VAO = 0;
    _mesa_reference_buffer_object(ctx, &mipmap->buf_obj, NULL);
-   _mesa_DeleteSamplers(1, &mipmap->Sampler);
-   mipmap->Sampler = 0;
+   _mesa_reference_sampler_object(ctx, &mipmap->samp_obj, NULL);
 
    if (mipmap->FBO != 0) {
       _mesa_DeleteFramebuffers(1, &mipmap->FBO);
@@ -182,7 +181,7 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
                                       ctx->Extensions.ARB_fragment_shader;
    GLenum faceTarget;
    GLuint dstLevel;
-   GLuint samplerSave;
+   struct gl_sampler_object *samp_obj_save = NULL;
    GLint swizzle[4];
    GLboolean swizzleSaved = GL_FALSE;
 
@@ -213,8 +212,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
       _mesa_set_enable(ctx, target, GL_TRUE);
    }
 
-   samplerSave = ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler ?
-      ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler->Name : 0;
+   _mesa_reference_sampler_object(ctx, &samp_obj_save,
+                                  ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler);
 
    /* We may have been called from glGenerateTextureMipmap with CurrentUnit
     * still set to 0, so we don't know when we can skip binding the texture.
@@ -223,30 +222,29 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
     */
    _mesa_BindTexture(target, texObj->Name);
 
-   if (!mipmap->Sampler) {
-      _mesa_GenSamplers(1, &mipmap->Sampler);
-      _mesa_BindSampler(ctx->Texture.CurrentUnit, mipmap->Sampler);
-
-      _mesa_SamplerParameteri(mipmap->Sampler,
-                              GL_TEXTURE_MIN_FILTER,
-                              GL_LINEAR_MIPMAP_LINEAR);
-      _mesa_SamplerParameteri(mipmap->Sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-      _mesa_SamplerParameteri(mipmap->Sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-      _mesa_SamplerParameteri(mipmap->Sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-      _mesa_SamplerParameteri(mipmap->Sampler, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
-
-      /* We don't want to encode or decode sRGB values; treat them as linear.
-       * This is not technically correct for GLES3 but we don't get any API
-       * error at the moment.
-       */
-      if (ctx->Extensions.EXT_texture_sRGB_decode) {
-         _mesa_SamplerParameteri(mipmap->Sampler, GL_TEXTURE_SRGB_DECODE_EXT,
-               GL_SKIP_DECODE_EXT);
+   if (mipmap->samp_obj == NULL) {
+      mipmap->samp_obj =  ctx->Driver.NewSamplerObject(ctx, 0xDEADBEEF);
+      if (mipmap->samp_obj == NULL) {
+         /* This is a bit lazy.  Flag out of memory, and then don't bother to
+          * clean up.  Once out of memory is flagged, the only realistic next
+          * move is to destroy the context.  That will trigger all the right
+          * clean up.
+          */
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGenerateMipmap");
+         return;
       }
-   } else {
-      _mesa_BindSampler(ctx->Texture.CurrentUnit, mipmap->Sampler);
+
+      _mesa_set_sampler_filters(ctx, mipmap->samp_obj, GL_LINEAR_MIPMAP_LINEAR,
+                                GL_LINEAR);
+      _mesa_set_sampler_wrap(ctx, mipmap->samp_obj, GL_CLAMP_TO_EDGE,
+                             GL_CLAMP_TO_EDGE, GL_CLAMP_TO_EDGE);
+
+      /* We don't want to encode or decode sRGB values; treat them as linear. */
+      _mesa_set_sampler_srgb_decode(ctx, mipmap->samp_obj, GL_SKIP_DECODE_EXT);
    }
 
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, mipmap->samp_obj);
+
    assert(mipmap->FBO != 0);
    _mesa_BindFramebuffer(GL_FRAMEBUFFER_EXT, mipmap->FBO);
 
@@ -370,7 +368,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
 
    _mesa_lock_texture(ctx, texObj); /* relock */
 
-   _mesa_BindSampler(ctx->Texture.CurrentUnit, samplerSave);
+   _mesa_bind_sampler(ctx, ctx->Texture.CurrentUnit, samp_obj_save);
+   _mesa_reference_sampler_object(ctx, &samp_obj_save, NULL);
 
    _mesa_meta_end(ctx);
 
index bb840ea..e1874c3 100644 (file)
@@ -37,18 +37,26 @@ TODO: document the other workarounds.
 
         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
        </application>
 
         <application name="Unigine Heaven (64-bit)" executable="heaven_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 4.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
        </application>
 
         <application name="Unigine Valley (32-bit)" executable="valley_x86">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
        </application>
 
         <application name="Unigine Valley (64-bit)" executable="valley_x64">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
+            <!-- remove disable_blend_func_extended if 1.1 ever comes out -->
+            <option name="disable_blend_func_extended" value="true" />
        </application>
 
         <application name="Unigine OilRush (32-bit)" executable="OilRush_x86">
index ef06743..e676096 100644 (file)
@@ -99,7 +99,7 @@ intel_bufferobj_free(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_align_free(intel_obj->sys_buffer);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
index a718556..213cdbd 100644 (file)
@@ -138,7 +138,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 
index d147a73..8677743 100644 (file)
@@ -2,7 +2,6 @@ i965_compiler_FILES = \
        brw_cfg.cpp \
        brw_cfg.h \
        brw_compiler.h \
-       brw_cubemap_normalize.cpp \
        brw_dead_control_flow.cpp \
        brw_dead_control_flow.h \
        brw_defines.h \
@@ -16,7 +15,6 @@ i965_compiler_FILES = \
        brw_eu_util.c \
        brw_eu_validate.c \
        brw_fs_builder.h \
-       brw_fs_channel_expressions.cpp \
        brw_fs_cmod_propagation.cpp \
        brw_fs_combine_constants.cpp \
        brw_fs_copy_propagation.cpp \
@@ -35,15 +33,12 @@ i965_compiler_FILES = \
        brw_fs_surface_builder.cpp \
        brw_fs_surface_builder.h \
        brw_fs_validate.cpp \
-       brw_fs_vector_splitting.cpp \
        brw_fs_visitor.cpp \
        brw_inst.h \
        brw_interpolation_map.c \
        brw_ir_allocator.h \
        brw_ir_fs.h \
        brw_ir_vec4.h \
-       brw_lower_texture_gradients.cpp \
-       brw_lower_unnormalized_offset.cpp \
        brw_nir.h \
        brw_nir.c \
        brw_nir_analyze_boolean_resolves.c \
@@ -56,6 +51,7 @@ i965_compiler_FILES = \
        brw_shader.cpp \
        brw_shader.h \
        brw_surface_formats.c \
+       brw_surface_formats.h \
        brw_util.c \
        brw_util.h \
        brw_vec4_builder.h \
@@ -75,6 +71,8 @@ i965_compiler_FILES = \
        brw_vec4_reg_allocate.cpp \
        brw_vec4_surface_builder.cpp \
        brw_vec4_surface_builder.h \
+       brw_vec4_tcs.cpp \
+       brw_vec4_tes.cpp \
        brw_vec4_visitor.cpp \
        brw_vec4_vs_visitor.cpp \
        brw_vue_map.c \
@@ -112,6 +110,7 @@ i965_FILES = \
        brw_context.h \
        brw_cs.c \
        brw_cs.h \
+       brw_cubemap_normalize.cpp \
        brw_curbe.c \
        brw_draw.c \
        brw_draw.h \
@@ -119,11 +118,15 @@ i965_FILES = \
        brw_ff_gs.c \
        brw_ff_gs_emit.c \
        brw_ff_gs.h \
+       brw_fs_channel_expressions.cpp \
+       brw_fs_vector_splitting.cpp \
        brw_gs.c \
        brw_gs.h \
        brw_gs_state.c \
        brw_gs_surface_state.c \
        brw_link.cpp \
+       brw_lower_texture_gradients.cpp \
+       brw_lower_unnormalized_offset.cpp \
        brw_meta_fast_clear.c \
        brw_meta_stencil_blit.c \
        brw_meta_updownsample.c \
@@ -150,7 +153,9 @@ i965_FILES = \
        brw_state.h \
        brw_state_upload.c \
        brw_structs.h \
+       brw_tcs.c \
        brw_tcs_surface_state.c \
+       brw_tes.c \
        brw_tes_surface_state.c \
        brw_tex.c \
        brw_tex_layout.c \
index 80935cf..7fa5d60 100644 (file)
@@ -196,12 +196,12 @@ const struct brw_tracked_state brw_wm_binding_table = {
    .emit = brw_upload_wm_binding_table,
 };
 
-/** Upload the TCS binding table (if TCS is active). */
+/** Upload the TCS binding table (if tessellation stages are active). */
 static void
 brw_tcs_upload_binding_table(struct brw_context *brw)
 {
-   /* If there's no TCS, skip changing anything. */
-   if (brw->tess_ctrl_program == NULL)
+   /* Skip if the tessellation stages are disabled. */
+   if (brw->tess_eval_program == NULL)
       return;
 
    /* BRW_NEW_TCS_PROG_DATA */
@@ -216,6 +216,7 @@ const struct brw_tracked_state brw_tcs_binding_table = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_BATCH |
+             BRW_NEW_DEFAULT_TESS_LEVELS |
              BRW_NEW_SURFACES |
              BRW_NEW_TCS_CONSTBUF |
              BRW_NEW_TCS_PROG_DATA,
index e684bdb..fd23e23 100644 (file)
@@ -32,7 +32,7 @@ brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw,
      generator(brw->intelScreen->compiler, brw,
                mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
                (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data),
-               0, false, "BLORP")
+               0, false, MESA_SHADER_FRAGMENT)
 {
    if (debug_flag)
       generator.enable_debug("blorp");
@@ -85,7 +85,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
                                           unsigned msg_length)
 {
    fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf),
-                                         brw_imm_ud(0u));
+                                         brw_imm_ud(0u), brw_imm_ud(0u));
 
    inst->base_mrf = base_mrf;
    inst->mlen = msg_length;
index 514788c..b66869b 100644 (file)
@@ -94,6 +94,9 @@ struct brw_compiler {
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
+
 
 /**
  * Program key structures.
@@ -191,6 +194,38 @@ struct brw_vs_prog_key {
    struct brw_sampler_prog_key_data tex;
 };
 
+/** The program key for Tessellation Control Shaders. */
+struct brw_tcs_prog_key
+{
+   unsigned program_string_id;
+
+   GLenum tes_primitive_mode;
+
+   unsigned input_vertices;
+
+   /** A bitfield of per-patch outputs written. */
+   uint32_t patch_outputs_written;
+
+   /** A bitfield of per-vertex outputs written. */
+   uint64_t outputs_written;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Evaluation Shaders. */
+struct brw_tes_prog_key
+{
+   unsigned program_string_id;
+
+   /** A bitfield of per-patch inputs read. */
+   uint32_t patch_inputs_read;
+
+   /** A bitfield of per-vertex inputs read. */
+   uint64_t inputs_read;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
 /** The program key for Geometry Shaders. */
 struct brw_gs_prog_key
 {
@@ -442,7 +477,7 @@ struct brw_vue_map {
     * additional processing is applied before storing them in the VUE), the
     * value is -1.
     */
-   signed char varying_to_slot[BRW_VARYING_SLOT_COUNT];
+   signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
 
    /**
     * Map from VUE slot to gl_varying_slot value.  For slots that do not
@@ -451,12 +486,24 @@ struct brw_vue_map {
     *
     * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
     */
-   signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
+   signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
 
    /**
     * Total number of VUE slots in use
     */
    int num_slots;
+
+   /**
+    * Number of per-patch VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_patch_slots;
+
+   /**
+    * Number of per-vertex VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_vertex_slots;
 };
 
 void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
@@ -484,6 +531,10 @@ void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          GLbitfield64 slots_valid,
                          bool separate_shader);
 
+void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
+                              const GLbitfield64 slots_valid,
+                              const GLbitfield is_patch);
+
 enum shader_dispatch_mode {
    DISPATCH_MODE_4X1_SINGLE = 0,
    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
@@ -547,6 +598,9 @@ struct brw_vs_prog_data {
 
    bool uses_vertexid;
    bool uses_instanceid;
+   bool uses_basevertex;
+   bool uses_baseinstance;
+   bool uses_drawid;
 };
 
 struct brw_tcs_prog_data
@@ -653,6 +707,38 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                char **error_str);
 
 /**
+ * Compile a tessellation control shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const struct nir_shader *nir,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                struct brw_tes_prog_data *prog_data,
+                const struct nir_shader *shader,
+                struct gl_shader_program *shader_prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
  * Compile a vertex shader.
  *
  * Returns the final assembly and the program's size.
index 0abe601..9ba3339 100644 (file)
@@ -159,8 +159,10 @@ intel_viewport(struct gl_context *ctx)
    __DRIcontext *driContext = brw->driContext;
 
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
-      dri2InvalidateDrawable(driContext->driDrawablePriv);
-      dri2InvalidateDrawable(driContext->driReadablePriv);
+      if (driContext->driDrawablePriv)
+         dri2InvalidateDrawable(driContext->driDrawablePriv);
+      if (driContext->driReadablePriv)
+         dri2InvalidateDrawable(driContext->driReadablePriv);
    }
 }
 
@@ -372,12 +374,15 @@ brw_initialize_context_constants(struct brw_context *brw)
 
    const bool stage_exists[MESA_SHADER_STAGES] = {
       [MESA_SHADER_VERTEX] = true,
-      [MESA_SHADER_TESS_CTRL] = brw->gen >= 8,
-      [MESA_SHADER_TESS_EVAL] = brw->gen >= 8,
+      [MESA_SHADER_TESS_CTRL] = brw->gen >= 7,
+      [MESA_SHADER_TESS_EVAL] = brw->gen >= 7,
       [MESA_SHADER_GEOMETRY] = brw->gen >= 6,
       [MESA_SHADER_FRAGMENT] = true,
       [MESA_SHADER_COMPUTE] =
-         (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) ||
+         (ctx->API == API_OPENGL_CORE &&
+          ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) ||
+         (ctx->API == API_OPENGLES2 &&
+          ctx->Const.MaxComputeWorkGroupSize[0] >= 128) ||
          _mesa_extension_override_enables.ARB_compute_shader,
    };
 
@@ -807,7 +812,7 @@ brwCreateContext(gl_api api,
    brw->needs_unlit_centroid_workaround =
       devinfo->needs_unlit_centroid_workaround;
 
-   brw->must_use_separate_stencil = screen->hw_must_use_separate_stencil;
+   brw->must_use_separate_stencil = devinfo->must_use_separate_stencil;
    brw->has_swizzling = screen->hw_has_swizzling;
 
    brw->vs.base.stage = MESA_SHADER_VERTEX;
index 1cc4c7b..7b0340f 100644 (file)
@@ -179,8 +179,7 @@ enum brw_state_id {
    BRW_STATE_URB_FENCE = BRW_MAX_CACHE,
    BRW_STATE_FRAGMENT_PROGRAM,
    BRW_STATE_GEOMETRY_PROGRAM,
-   BRW_STATE_TESS_CTRL_PROGRAM,
-   BRW_STATE_TESS_EVAL_PROGRAM,
+   BRW_STATE_TESS_PROGRAMS,
    BRW_STATE_VERTEX_PROGRAM,
    BRW_STATE_CURBE_OFFSETS,
    BRW_STATE_REDUCED_PRIMITIVE,
@@ -192,6 +191,7 @@ enum brw_state_id {
    BRW_STATE_BINDING_TABLE_POINTERS,
    BRW_STATE_INDICES,
    BRW_STATE_VERTICES,
+   BRW_STATE_DEFAULT_TESS_LEVELS,
    BRW_STATE_BATCH,
    BRW_STATE_INDEX_BUFFER,
    BRW_STATE_VS_CONSTBUF,
@@ -262,8 +262,7 @@ enum brw_state_id {
 #define BRW_NEW_URB_FENCE               (1ull << BRW_STATE_URB_FENCE)
 #define BRW_NEW_FRAGMENT_PROGRAM        (1ull << BRW_STATE_FRAGMENT_PROGRAM)
 #define BRW_NEW_GEOMETRY_PROGRAM        (1ull << BRW_STATE_GEOMETRY_PROGRAM)
-#define BRW_NEW_TESS_EVAL_PROGRAM       (1ull << BRW_STATE_TESS_EVAL_PROGRAM)
-#define BRW_NEW_TESS_CTRL_PROGRAM       (1ull << BRW_STATE_TESS_CTRL_PROGRAM)
+#define BRW_NEW_TESS_PROGRAMS           (1ull << BRW_STATE_TESS_PROGRAMS)
 #define BRW_NEW_VERTEX_PROGRAM          (1ull << BRW_STATE_VERTEX_PROGRAM)
 #define BRW_NEW_CURBE_OFFSETS           (1ull << BRW_STATE_CURBE_OFFSETS)
 #define BRW_NEW_REDUCED_PRIMITIVE       (1ull << BRW_STATE_REDUCED_PRIMITIVE)
@@ -275,6 +274,7 @@ enum brw_state_id {
 #define BRW_NEW_BINDING_TABLE_POINTERS  (1ull << BRW_STATE_BINDING_TABLE_POINTERS)
 #define BRW_NEW_INDICES                 (1ull << BRW_STATE_INDICES)
 #define BRW_NEW_VERTICES                (1ull << BRW_STATE_VERTICES)
+#define BRW_NEW_DEFAULT_TESS_LEVELS     (1ull << BRW_STATE_DEFAULT_TESS_LEVELS)
 /**
  * Used for any batch entry with a relocated pointer that will be used
  * by any 3D rendering.
@@ -909,8 +909,13 @@ struct brw_context
    uint32_t pma_stall_bits;
 
    struct {
-      /** The value of gl_BaseVertex for the current _mesa_prim. */
-      int gl_basevertex;
+      struct {
+         /** The value of gl_BaseVertex for the current _mesa_prim. */
+         int gl_basevertex;
+
+         /** The value of gl_BaseInstance for the current _mesa_prim. */
+         int gl_baseinstance;
+      } params;
 
       /**
        * Buffer and offset used for GL_ARB_shader_draw_parameters
@@ -918,6 +923,15 @@ struct brw_context
        */
       drm_intel_bo *draw_params_bo;
       uint32_t draw_params_offset;
+
+      /**
+       * The value of gl_DrawID for the current _mesa_prim. This always comes
+       * in from it's own vertex buffer since it's not part of the indirect
+       * draw parameters.
+       */
+      int gl_drawid;
+      drm_intel_bo *draw_id_bo;
+      uint32_t draw_id_offset;
    } draw;
 
    struct {
@@ -1008,6 +1022,8 @@ struct brw_context
    struct {
       GLuint vsize;            /* vertex size plus header in urb registers */
       GLuint gsize;            /* GS output size in urb registers */
+      GLuint hsize;             /* Tessellation control output size in urb registers */
+      GLuint dsize;             /* Tessellation evaluation output size in urb registers */
       GLuint csize;            /* constant buffer size in urb registers */
       GLuint sfsize;           /* setup data size in urb registers */
 
@@ -1020,12 +1036,16 @@ struct brw_context
       GLuint max_gs_entries;   /* Maximum number of GS entries */
 
       GLuint nr_vs_entries;
+      GLuint nr_hs_entries;
+      GLuint nr_ds_entries;
       GLuint nr_gs_entries;
       GLuint nr_clip_entries;
       GLuint nr_sf_entries;
       GLuint nr_cs_entries;
 
       GLuint vs_start;
+      GLuint hs_start;
+      GLuint ds_start;
       GLuint gs_start;
       GLuint clip_start;
       GLuint sf_start;
@@ -1042,6 +1062,11 @@ struct brw_context
        * URB space for the GS.
        */
       bool gs_present;
+
+      /* True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the HS and DS.
+       */
+      bool tess_present;
    } urb;
 
 
@@ -1648,12 +1673,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw);
 /* gen7_urb.c */
 void
 gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                               unsigned gs_size, unsigned fs_size);
 
 void
 gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                     unsigned gs_size, unsigned gs_start);
 
 
@@ -1687,6 +1718,18 @@ brw_vertex_program_const(const struct gl_vertex_program *p)
    return (const struct brw_vertex_program *) p;
 }
 
+static inline struct brw_tess_ctrl_program *
+brw_tess_ctrl_program(struct gl_tess_ctrl_program *p)
+{
+   return (struct brw_tess_ctrl_program *) p;
+}
+
+static inline struct brw_tess_eval_program *
+brw_tess_eval_program(struct gl_tess_eval_program *p)
+{
+   return (struct brw_tess_eval_program *) p;
+}
+
 static inline struct brw_geometry_program *
 brw_geometry_program(struct gl_geometry_program *p)
 {
index 4a184cf..df7a79f 100644 (file)
@@ -63,6 +63,7 @@
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 8)
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 8)
 
+#ifndef _3DPRIM_POINTLIST /* FIXME: Avoid clashing with defines from bdw_pack.h */
 #define _3DPRIM_POINTLIST         0x01
 #define _3DPRIM_LINELIST          0x02
 #define _3DPRIM_LINESTRIP         0x03
@@ -86,6 +87,7 @@
 #define _3DPRIM_TRIFAN_NOSTIPPLE  0x16
 #define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
 
+#endif /* bdw_pack.h */
 
 /* We use this offset to be able to pass native primitive types in struct
  * _mesa_prim::mode.  Native primitive types are BRW_PRIM_OFFSET +
@@ -1305,6 +1307,21 @@ enum opcode {
     *           UD immediate).
     */
    SHADER_OPCODE_MOV_INDIRECT,
+
+   VEC4_OPCODE_URB_READ,
+   TCS_OPCODE_GET_INSTANCE_ID,
+   TCS_OPCODE_URB_WRITE,
+   TCS_OPCODE_SET_INPUT_URB_OFFSETS,
+   TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
+   TCS_OPCODE_GET_PRIMITIVE_ID,
+   TCS_OPCODE_CREATE_BARRIER_HEADER,
+   TCS_OPCODE_SRC0_010_IS_ZERO,
+   TCS_OPCODE_RELEASE_INPUT,
+   TCS_OPCODE_THREAD_END,
+
+   TES_OPCODE_GET_PRIMITIVE_ID,
+   TES_OPCODE_CREATE_INPUT_READ_HEADER,
+   TES_OPCODE_ADD_INDIRECT_URB_OFFSET,
 };
 
 enum brw_urb_write_flags {
index 4bfc831..e8af70c 100644 (file)
@@ -402,6 +402,71 @@ static const struct brw_device_info brw_device_info_bxt = {
    }
 };
 
+/*
+ * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
+ * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
+ */
+
+/*
+ * Both SKL and KBL support a maximum of 64 threads per
+ * Pixel Shader Dispatch (PSD) unit.
+ */
+#define  KBL_MAX_THREADS_PER_PSD 64
+
+static const struct brw_device_info brw_device_info_kbl_gt1 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
+   .urb.size = 192,
+   .num_slices = 1,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt1_5 = {
+   GEN9_FEATURES,
+   .gt = 1,
+
+   .max_cs_threads = 7 * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+   .num_slices = 1,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt2 = {
+   GEN9_FEATURES,
+   .gt = 2,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+   .num_slices = 1,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt3 = {
+   GEN9_FEATURES,
+   .gt = 3,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
+   .num_slices = 2,
+};
+
+static const struct brw_device_info brw_device_info_kbl_gt4 = {
+   GEN9_FEATURES,
+   .gt = 4,
+
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
+   /*
+    * From the "L3 Allocation and Programming" documentation:
+    *
+    * "URB is limited to 1008KB due to programming restrictions.  This
+    *  is not a restriction of the L3 implementation, but of the FF and
+    *  other clients.  Therefore, in a GT4 implementation it is
+    *  possible for the programmed allocation of the L3 data array to
+    *  provide 3*384KB=1152KB for URB, but only 1008KB of this
+    *  will be used."
+    */
+   .urb.size = 1008 / 3,
+   .num_slices = 3,
+};
+
 const struct brw_device_info *
 brw_get_device_info(int devid)
 {
@@ -418,3 +483,15 @@ brw_get_device_info(int devid)
 
    return devinfo;
 }
+
+const char *
+brw_get_device_name(int devid)
+{
+   switch (devid) {
+#undef CHIPSET
+#define CHIPSET(id, family, name) case id: return name;
+#include "pci_ids/i965_pci_ids.h"
+   default:
+      return NULL;
+   }
+}
index 73d6820..48e0dee 100644 (file)
@@ -98,3 +98,4 @@ struct brw_device_info
 };
 
 const struct brw_device_info *brw_get_device_info(int devid);
+const char *brw_get_device_name(int devid);
index 8398471..23e71fd 100644 (file)
 
 #define FILE_DEBUG_FLAG DEBUG_PRIMS
 
-static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
-   [GL_POINTS] =_3DPRIM_POINTLIST,
-   [GL_LINES] = _3DPRIM_LINELIST,
-   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
-   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
-   [GL_TRIANGLES] = _3DPRIM_TRILIST,
-   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
-   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
-   [GL_QUADS] = _3DPRIM_QUADLIST,
-   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
-   [GL_POLYGON] = _3DPRIM_POLYGON,
-   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
-   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
-   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
-   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
-
 
 static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POINTS] = GL_POINTS,
@@ -85,18 +68,6 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POLYGON] = GL_TRIANGLES
 };
 
-uint32_t
-get_hw_prim_for_gl_prim(int mode)
-{
-   if (mode >= BRW_PRIM_OFFSET)
-      return mode - BRW_PRIM_OFFSET;
-   else {
-      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
-      return prim_to_hw_prim[mode];
-   }
-}
-
-
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
  * programs be immune to the active primitive (ie. cope with all
@@ -491,9 +462,29 @@ brw_try_draw_prims(struct gl_context *ctx,
          }
       }
 
-      brw->draw.gl_basevertex =
+      /* Determine if we need to flag BRW_NEW_VERTICES for updating the
+       * gl_BaseVertexARB or gl_BaseInstanceARB values. For indirect draw, we
+       * always flag if the shader uses one of the values. For direct draws,
+       * we only flag if the values change.
+       */
+      const int new_basevertex =
          prims[i].indexed ? prims[i].basevertex : prims[i].start;
+      const int new_baseinstance = prims[i].base_instance;
+      if (i > 0) {
+         const bool uses_draw_parameters =
+            brw->vs.prog_data->uses_basevertex ||
+            brw->vs.prog_data->uses_baseinstance;
+
+         if ((uses_draw_parameters && prims[i].is_indirect) ||
+             (brw->vs.prog_data->uses_basevertex &&
+              brw->draw.params.gl_basevertex != new_basevertex) ||
+             (brw->vs.prog_data->uses_baseinstance &&
+              brw->draw.params.gl_baseinstance != new_baseinstance))
+            brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
+      }
 
+      brw->draw.params.gl_basevertex = new_basevertex;
+      brw->draw.params.gl_baseinstance = new_baseinstance;
       drm_intel_bo_unreference(brw->draw.draw_params_bo);
 
       if (prims[i].is_indirect) {
@@ -511,6 +502,18 @@ brw_try_draw_prims(struct gl_context *ctx,
          brw->draw.draw_params_offset = 0;
       }
 
+      /* gl_DrawID always needs its own vertex buffer since it's not part of
+       * the indirect parameter buffer. If the program uses gl_DrawID we need
+       * to flag BRW_NEW_VERTICES. For the first iteration, we don't have
+       * valid brw->vs.prog_data, but we always flag BRW_NEW_VERTICES before
+       * the loop.
+       */
+      brw->draw.gl_drawid = prims[i].draw_id;
+      drm_intel_bo_unreference(brw->draw.draw_id_bo);
+      brw->draw.draw_id_bo = NULL;
+      if (i > 0 && brw->vs.prog_data->uses_drawid)
+         brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
+
       if (brw->gen < 6)
         brw_set_prim(brw, &prims[i]);
       else
index ea0f6f2..f781d8b 100644 (file)
@@ -592,11 +592,19 @@ void
 brw_prepare_shader_draw_parameters(struct brw_context *brw)
 {
    /* For non-indirect draws, upload gl_BaseVertex. */
-   if (brw->vs.prog_data->uses_vertexid && brw->draw.draw_params_bo == NULL) {
-      intel_upload_data(brw, &brw->draw.gl_basevertex, 4, 4,
+   if ((brw->vs.prog_data->uses_basevertex ||
+        brw->vs.prog_data->uses_baseinstance) &&
+       brw->draw.draw_params_bo == NULL) {
+      intel_upload_data(brw, &brw->draw.params, sizeof(brw->draw.params), 4,
                        &brw->draw.draw_params_bo,
                         &brw->draw.draw_params_offset);
    }
+
+   if (brw->vs.prog_data->uses_drawid) {
+      intel_upload_data(brw, &brw->draw.gl_drawid, sizeof(brw->draw.gl_drawid), 4,
+                        &brw->draw.draw_id_bo,
+                        &brw->draw.draw_id_offset);
+   }
 }
 
 /**
@@ -658,8 +666,11 @@ brw_emit_vertices(struct brw_context *brw)
    brw_emit_query_begin(brw);
 
    unsigned nr_elements = brw->vb.nr_enabled;
-   if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid)
+   if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid ||
+       brw->vs.prog_data->uses_basevertex || brw->vs.prog_data->uses_baseinstance)
       ++nr_elements;
+   if (brw->vs.prog_data->uses_drawid)
+      nr_elements++;
 
    /* If the VS doesn't read any inputs (calculating vertex position from
     * a state variable for some reason, for example), emit a single pad
@@ -693,8 +704,11 @@ brw_emit_vertices(struct brw_context *brw)
    /* Now emit VB and VEP state packets.
     */
 
-   unsigned nr_buffers =
-      brw->vb.nr_buffers + brw->vs.prog_data->uses_vertexid;
+   const bool uses_draw_params =
+      brw->vs.prog_data->uses_basevertex ||
+      brw->vs.prog_data->uses_baseinstance;
+   const unsigned nr_buffers = brw->vb.nr_buffers +
+      uses_draw_params + brw->vs.prog_data->uses_drawid;
 
    if (nr_buffers) {
       if (brw->gen >= 6) {
@@ -713,7 +727,7 @@ brw_emit_vertices(struct brw_context *brw)
 
       }
 
-      if (brw->vs.prog_data->uses_vertexid) {
+      if (uses_draw_params) {
          EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
                                   brw->draw.draw_params_bo,
                                   brw->draw.draw_params_bo->size - 1,
@@ -721,6 +735,16 @@ brw_emit_vertices(struct brw_context *brw)
                                   0,  /* stride */
                                   0); /* step rate */
       }
+
+      if (brw->vs.prog_data->uses_drawid) {
+         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1,
+                                  brw->draw.draw_id_bo,
+                                  brw->draw.draw_id_bo->size - 1,
+                                  brw->draw.draw_id_offset,
+                                  0,  /* stride */
+                                  0); /* step rate */
+      }
+
       ADVANCE_BATCH();
    }
 
@@ -790,21 +814,25 @@ brw_emit_vertices(struct brw_context *brw)
                     ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
 
-   if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) {
+   if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid ||
+       brw->vs.prog_data->uses_basevertex || brw->vs.prog_data->uses_baseinstance) {
       uint32_t dw0 = 0, dw1 = 0;
       uint32_t comp0 = BRW_VE1_COMPONENT_STORE_0;
       uint32_t comp1 = BRW_VE1_COMPONENT_STORE_0;
       uint32_t comp2 = BRW_VE1_COMPONENT_STORE_0;
       uint32_t comp3 = BRW_VE1_COMPONENT_STORE_0;
 
-      if (brw->vs.prog_data->uses_vertexid) {
+      if (brw->vs.prog_data->uses_basevertex)
          comp0 = BRW_VE1_COMPONENT_STORE_SRC;
+
+      if (brw->vs.prog_data->uses_baseinstance)
+         comp1 = BRW_VE1_COMPONENT_STORE_SRC;
+
+      if (brw->vs.prog_data->uses_vertexid)
          comp2 = BRW_VE1_COMPONENT_STORE_VID;
-      }
 
-      if (brw->vs.prog_data->uses_instanceid) {
+      if (brw->vs.prog_data->uses_instanceid)
          comp3 = BRW_VE1_COMPONENT_STORE_IID;
-      }
 
       dw1 = (comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
             (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
@@ -814,11 +842,11 @@ brw_emit_vertices(struct brw_context *brw)
       if (brw->gen >= 6) {
          dw0 |= GEN6_VE0_VALID |
                 brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
-                BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT;
+                BRW_SURFACEFORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
       } else {
          dw0 |= BRW_VE0_VALID |
                 brw->vb.nr_buffers << BRW_VE0_INDEX_SHIFT |
-                BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT;
+                BRW_SURFACEFORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
         dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
       }
 
@@ -830,6 +858,30 @@ brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH(dw1);
    }
 
+   if (brw->vs.prog_data->uses_drawid) {
+      uint32_t dw0 = 0, dw1 = 0;
+
+      dw1 = (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
+            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_1_SHIFT) |
+            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_2_SHIFT) |
+            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_3_SHIFT);
+
+      if (brw->gen >= 6) {
+         dw0 |= GEN6_VE0_VALID |
+                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
+                (BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
+      } else {
+         dw0 |= BRW_VE0_VALID |
+                ((brw->vb.nr_buffers + 1) << BRW_VE0_INDEX_SHIFT) |
+                (BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
+
+        dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
+      }
+
+      OUT_BATCH(dw0);
+      OUT_BATCH(dw1);
+   }
+
    if (brw->gen >= 6 && gen6_edgeflag_input) {
       uint32_t format =
          brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
index 5fb9662..35d8039 100644 (file)
@@ -847,12 +847,11 @@ brw_alu2(struct brw_codegen *p, unsigned opcode,
 static int
 get_3src_subreg_nr(struct brw_reg reg)
 {
-   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
-      assert(brw_is_single_value_swizzle(reg.swizzle));
-      return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0);
-   } else {
-      return reg.subnr / 4;
-   }
+   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
+    * use 32-bit units (components 0..7).  Since they only support F/D/UD
+    * types, this doesn't lose any flexibility, but uses fewer bits.
+    */
+   return reg.subnr / 4;
 }
 
 static brw_inst *
index 6c4a70f..fc883f4 100644 (file)
@@ -508,6 +508,7 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -737,15 +738,15 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_LOD_LOGICAL:
    case SHADER_OPCODE_TG4_LOGICAL:
    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      assert(src[8].file == IMM && src[9].file == IMM);
+      assert(src[9].file == IMM && src[10].file == IMM);
       /* Texture coordinates. */
       if (i == 0)
-         return src[8].ud;
+         return src[9].ud;
       /* Texture derivatives. */
       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[9].ud;
+         return src[10].ud;
       /* Texture offset. */
-      else if (i == 7)
+      else if (i == 8)
          return 2;
       /* MCS */
       else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
@@ -1671,7 +1672,10 @@ fs_visitor::assign_vs_urb_setup()
 
    assert(stage == MESA_SHADER_VERTEX);
    int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
+       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+      count++;
+   if (vs_prog_data->uses_drawid)
       count++;
 
    /* Each attribute is 4 regs. */
@@ -1686,6 +1690,21 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
+fs_visitor::assign_tes_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
+void
 fs_visitor::assign_gs_urb_setup()
 {
    assert(stage == MESA_SHADER_GEOMETRY);
@@ -2814,10 +2833,23 @@ fs_visitor::emit_repclear_shader()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    int base_mrf = 1;
    int color_mrf = base_mrf + 2;
+   fs_inst *mov;
 
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   if (uniforms == 1) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE,
+                 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8,
+                 BRW_WIDTH_2,
+                 BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
 
    fs_inst *write;
    if (key->nr_color_regions == 1) {
@@ -2846,8 +2878,10 @@ fs_visitor::emit_repclear_shader()
    assign_curb_setup();
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms == 1) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
 }
 
 /**
@@ -3455,8 +3489,7 @@ fs_visitor::lower_integer_multiplication()
              */
             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
-            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
-                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
             mul->src[1].stride *= 2;
 
          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
@@ -3674,6 +3707,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &coordinate,
                                 const fs_reg &shadow_c,
                                 const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &surface,
                                 const fs_reg &sampler,
                                 unsigned coord_components,
                                 unsigned grad_components)
@@ -3766,8 +3800,9 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
 
    inst->opcode = op;
    inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = msg_begin.nr;
    inst->mlen = msg_end.nr - msg_begin.nr;
    inst->header_size = 1;
@@ -3779,6 +3814,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &shadow_c,
                                 fs_reg lod, fs_reg lod2,
                                 const fs_reg &sample_index,
+                                const fs_reg &surface,
                                 const fs_reg &sampler,
                                 const fs_reg &offset_value,
                                 unsigned coord_components,
@@ -3861,8 +3897,9 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
 
    inst->opcode = op;
    inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = message.nr;
    inst->mlen = msg_end.nr - message.nr;
    inst->header_size = header_size;
@@ -3886,7 +3923,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &shadow_c,
                                 fs_reg lod, fs_reg lod2,
                                 const fs_reg &sample_index,
-                                const fs_reg &mcs, const fs_reg &sampler,
+                                const fs_reg &mcs,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
                                 fs_reg offset_value,
                                 unsigned coord_components,
                                 unsigned grad_components)
@@ -4089,8 +4128,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
    /* Generate the SEND. */
    inst->opcode = op;
    inst->src[0] = src_payload;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = -1;
    inst->mlen = mlen;
    inst->header_size = header_size;
@@ -4109,25 +4149,27 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &lod2 = inst->src[3];
    const fs_reg &sample_index = inst->src[4];
    const fs_reg &mcs = inst->src[5];
-   const fs_reg &sampler = inst->src[6];
-   const fs_reg &offset_value = inst->src[7];
-   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
-   const unsigned coord_components = inst->src[8].ud;
-   const unsigned grad_components = inst->src[9].ud;
+   const fs_reg &surface = inst->src[6];
+   const fs_reg &sampler = inst->src[7];
+   const fs_reg &offset_value = inst->src[8];
+   assert(inst->src[9].file == IMM && inst->src[10].file == IMM);
+   const unsigned coord_components = inst->src[9].ud;
+   const unsigned grad_components = inst->src[10].ud;
 
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      mcs, sampler, offset_value,
+                                      mcs, surface, sampler, offset_value,
                                       coord_components, grad_components);
    } else if (devinfo->gen >= 5) {
       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      sampler, offset_value,
+                                      surface, sampler, offset_value,
                                       coord_components, grad_components);
    } else {
       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
-                                      shadow_c, lod, lod2, sampler,
+                                      shadow_c, lod, lod2,
+                                      surface, sampler,
                                       coord_components, grad_components);
    }
 }
@@ -5267,6 +5309,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
 }
 
 bool
+fs_visitor::run_tes()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tes_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
+bool
 fs_visitor::run_gs()
 {
    assert(stage == MESA_SHADER_GEOMETRY);
@@ -5587,7 +5663,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    }
 
    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
-                  v.promoted_constants, v.runtime_check_aads_emit, "FS");
+                  v.promoted_constants, v.runtime_check_aads_emit,
+                  MESA_SHADER_FRAGMENT);
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
@@ -5712,7 +5789,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    }
 
    fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
-                  v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_COMPUTE);
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                    shader->info.label ? shader->info.label :
index d09f2e4..5a7a0eb 100644 (file)
@@ -81,7 +81,8 @@ public:
               struct gl_program *prog,
               const nir_shader *shader,
               unsigned dispatch_width,
-              int shader_time_index);
+              int shader_time_index,
+              const struct brw_vue_map *input_vue_map = NULL);
    fs_visitor(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               struct brw_gs_compile *gs_compile,
@@ -109,6 +110,7 @@ public:
 
    bool run_fs(bool do_rep_send);
    bool run_vs(gl_clip_plane *clip_planes);
+   bool run_tes();
    bool run_gs();
    bool run_cs();
    void optimize();
@@ -124,6 +126,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
+   void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
@@ -204,6 +207,8 @@ public:
                      fs_reg mcs,
                      int gather_component,
                      bool is_cube_array,
+                     uint32_t surface,
+                     fs_reg surface_reg,
                      uint32_t sampler,
                      fs_reg sampler_reg);
    fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
@@ -249,6 +254,8 @@ public:
                               nir_intrinsic_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
+   void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
    void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
                              int op, nir_intrinsic_instr *instr);
    void nir_emit_shared_atomic(const brw::fs_builder &bld,
@@ -260,6 +267,7 @@ public:
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
    fs_reg get_nir_image_deref(const nir_deref_var *deref);
+   fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
@@ -313,6 +321,8 @@ public:
    struct brw_stage_prog_data *prog_data;
    struct gl_program *prog;
 
+   const struct brw_vue_map *input_vue_map;
+
    int *virtual_grf_start;
    int *virtual_grf_end;
    brw::fs_live_variables *live_intervals;
@@ -415,7 +425,7 @@ public:
                 struct brw_stage_prog_data *prog_data,
                 unsigned promoted_constants,
                 bool runtime_check_aads_emit,
-                const char *stage_abbrev);
+                gl_shader_stage stage);
    ~fs_generator();
 
    void enable_debug(const char *shader_name);
@@ -438,6 +448,7 @@ private:
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg *src);
    void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                     struct brw_reg surface_index,
                      struct brw_reg sampler_index);
    void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
                                  struct brw_reg src,
@@ -526,7 +537,7 @@ private:
    bool runtime_check_aads_emit;
    bool debug_flag;
    const char *shader_name;
-   const char *stage_abbrev;
+   gl_shader_stage stage;
    void *mem_ctx;
 };
 
index b3fb0c6..21f0b70 100644 (file)
@@ -143,7 +143,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    ir_expression *expr = ir->rhs->as_expression();
    bool found_vector = false;
    unsigned int i, vector_elements = 1;
-   ir_variable *op_var[3];
+   ir_variable *op_var[4];
 
    if (!expr)
       return visit_continue;
@@ -288,23 +288,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       }
       break;
 
-   case ir_unop_any: {
-      ir_expression *temp;
-      temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
-                                       element_type,
-                                       get_element(op_var[0], 0),
-                                       get_element(op_var[0], 1));
-
-      for (i = 2; i < vector_elements; i++) {
-        temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
-                                          element_type,
-                                          get_element(op_var[0], i),
-                                          temp);
-      }
-      assign(ir, 0, temp);
-      break;
-   }
-
    case ir_binop_dot: {
       ir_expression *last = NULL;
       for (i = 0; i < vector_elements; i++) {
@@ -362,20 +345,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_noise:
       unreachable("noise should have been broken down to function call");
 
-   case ir_binop_bfm: {
-      /* Does not need to be scalarized, since its result will be identical
-       * for all channels.
-       */
-      ir_rvalue *op0 = get_element(op_var[0], 0);
-      ir_rvalue *op1 = get_element(op_var[1], 0);
-
-      assign(ir, 0, new(mem_ctx) ir_expression(expr->operation,
-                                               element_type,
-                                               op0,
-                                               op1));
-      break;
-   }
-
    case ir_binop_ubo_load:
    case ir_unop_get_buffer_size:
       unreachable("not yet supported");
@@ -397,22 +366,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       }
       break;
 
-   case ir_triop_bfi: {
-      /* Only a single BFM is needed for multiple BFIs. */
-      ir_rvalue *op0 = get_element(op_var[0], 0);
-
+   case ir_quadop_bitfield_insert:
       for (i = 0; i < vector_elements; i++) {
+         ir_rvalue *op0 = get_element(op_var[0], i);
          ir_rvalue *op1 = get_element(op_var[1], i);
          ir_rvalue *op2 = get_element(op_var[2], i);
+         ir_rvalue *op3 = get_element(op_var[3], i);
 
          assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
                                                   element_type,
-                                                  op0->clone(mem_ctx, NULL),
+                                                  op0,
                                                   op1,
-                                                  op2));
+                                                  op2,
+                                                  op3));
       }
       break;
-   }
 
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_snorm_4x8:
@@ -427,7 +395,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_binop_ldexp:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert:
-   case ir_quadop_bitfield_insert:
    case ir_quadop_vector:
    case ir_unop_ssbo_unsized_array_length:
       unreachable("should have been lowered");
index c5280ac..994c699 100644 (file)
@@ -39,6 +39,8 @@
 
 using namespace brw;
 
+static const bool debug = false;
+
 /* Returns whether an instruction could co-issue if its immediate source were
  * replaced with a GRF source.
  */
@@ -265,7 +267,6 @@ fs_visitor::opt_combine_constants()
    if (cfg->num_blocks != 1)
       qsort(table.imm, table.len, sizeof(struct imm), compare);
 
-
    /* Insert MOVs to load the constant values into GRFs. */
    fs_reg reg(VGRF, alloc.allocate(dispatch_width / 8));
    reg.stride = 0;
@@ -299,7 +300,26 @@ fs_visitor::opt_combine_constants()
          reg->subreg_offset = table.imm[i].subreg_offset;
          reg->stride = 0;
          reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
-         assert(fabsf(reg->f) == table.imm[i].val);
+         assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
+                fabsf(reg->f) == table.imm[i].val);
+      }
+   }
+
+   if (debug) {
+      for (int i = 0; i < table.len; i++) {
+         struct imm *imm = &table.imm[i];
+
+         printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
+                "IP: %4d to %4d, length %4d\n",
+                imm->val,
+                imm->block->num,
+                imm->nr,
+                imm->subreg_offset,
+                imm->must_promote,
+                imm->uses_by_coissue,
+                imm->first_use_ip,
+                imm->last_use_ip,
+                imm->last_use_ip - imm->first_use_ip);
       }
    }
 
index 7fa6d84..b1134cf 100644 (file)
@@ -111,14 +111,14 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            struct brw_stage_prog_data *prog_data,
                            unsigned promoted_constants,
                            bool runtime_check_aads_emit,
-                           const char *stage_abbrev)
+                           gl_shader_stage stage)
 
    : compiler(compiler), log_data(log_data),
      devinfo(compiler->devinfo), key(key),
      prog_data(prog_data),
      promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
-     stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
+     stage(stage), mem_ctx(mem_ctx)
 {
    p = rzalloc(mem_ctx, struct brw_codegen);
    brw_init_codegen(devinfo, p, mem_ctx);
@@ -702,6 +702,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst,
 
 void
 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                           struct brw_reg surface_index,
                            struct brw_reg sampler_index)
 {
    int msg_type = -1;
@@ -937,6 +938,14 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
             /* Set the offset bits in DWord 2. */
             brw_MOV(p, get_element_ud(header_reg, 2),
                        brw_imm_ud(inst->offset));
+         } else if (stage != MESA_SHADER_VERTEX &&
+                    stage != MESA_SHADER_FRAGMENT) {
+            /* The vertex and fragment stages have g0.2 set to 0, so
+             * header0.2 is 0 when g0 is copied. Other stages may not, so we
+             * must set it to 0 to avoid setting undesirable bits in the
+             * message.
+             */
+            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
          }
 
          brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
@@ -949,14 +958,16 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
          ? prog_data->binding_table.gather_texture_start
          : prog_data->binding_table.texture_start;
 
-   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
       uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  retype(dst, BRW_REGISTER_TYPE_UW),
                  inst->base_mrf,
                  src,
-                 sampler + base_binding_table_index,
+                 surface + base_binding_table_index,
                  sampler % 16,
                  msg_type,
                  rlen,
@@ -965,19 +976,24 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                  simd_mode,
                  return_format);
 
-      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
+      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
    } else {
       /* Non-const sampler index */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
-      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+         brw_OR(p, addr, addr, surface_reg);
+      }
       if (base_binding_table_index)
          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
@@ -2086,7 +2102,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-        generate_tex(inst, dst, src[0], src[1]);
+        generate_tex(inst, dst, src[0], src[1], src[2]);
         break;
       case FS_OPCODE_DDX_COARSE:
       case FS_OPCODE_DDX_FINE:
@@ -2329,8 +2345,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
    compiler->shader_debug_log(log_data,
                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                               "%d:%d spills:fills, Promoted %u constants, "
-                              "compacted %d to %d bytes.\n",
-                              stage_abbrev, dispatch_width, before_size / 16,
+                              "compacted %d to %d bytes.",
+                              _mesa_shader_stage_to_abbrev(stage),
+                              dispatch_width, before_size / 16,
                               loop_count, cfg->cycle_count, spill_count,
                               fill_count, promoted_constants, before_size,
                               after_size);
index 82a6ce2..eded5a9 100644 (file)
@@ -43,10 +43,10 @@ fs_visitor::emit_nir_code()
    nir_emit_system_values();
 
    /* get the main function and emit it */
-   nir_foreach_overload(nir, overload) {
-      assert(strcmp(overload->function->name, "main") == 0);
-      assert(overload->impl);
-      nir_emit_impl(overload->impl);
+   nir_foreach_function(nir, function) {
+      assert(strcmp(function->name, "main") == 0);
+      assert(function->impl);
+      nir_emit_impl(function->impl);
    }
 }
 
@@ -123,6 +123,7 @@ fs_visitor::nir_setup_outputs()
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY: {
          unsigned location = var->data.location;
          nir_setup_single_output_varying(&reg, var->type, &location);
@@ -212,6 +213,20 @@ emit_system_values_block(nir_block *block, void *void_visitor)
             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
          break;
 
+      case nir_intrinsic_load_base_instance:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
+         break;
+
+      case nir_intrinsic_load_draw_id:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
+         break;
+
       case nir_intrinsic_load_invocation_id:
          assert(v->stage == MESA_SHADER_GEOMETRY);
          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
@@ -328,10 +343,10 @@ fs_visitor::nir_emit_system_values()
       nir_system_values[i] = fs_reg();
    }
 
-   nir_foreach_overload(nir, overload) {
-      assert(strcmp(overload->function->name, "main") == 0);
-      assert(overload->impl);
-      nir_foreach_block(overload->impl, emit_system_values_block, this);
+   nir_foreach_function(nir, function) {
+      assert(strcmp(function->name, "main") == 0);
+      assert(function->impl);
+      nir_foreach_block(function->impl, emit_system_values_block, this);
    }
 }
 
@@ -434,6 +449,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       case MESA_SHADER_VERTEX:
          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
+      case MESA_SHADER_TESS_EVAL:
+         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
       case MESA_SHADER_GEOMETRY:
          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
@@ -776,9 +794,41 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       unreachable("Should have been lowered by borrow_to_arith().");
 
    case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
       break;
 
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = bld.MOV(bld.null_reg_d(), result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+      inst = bld.XOR(tmp, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = bld.ADD(result, result, op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
    case nir_op_flt:
    case nir_op_ilt:
    case nir_op_ult:
@@ -832,12 +882,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_fdot2:
    case nir_op_fdot3:
    case nir_op_fdot4:
-   case nir_op_bany2:
-   case nir_op_bany3:
-   case nir_op_bany4:
-   case nir_op_ball2:
-   case nir_op_ball3:
-   case nir_op_ball4:
    case nir_op_ball_fequal2:
    case nir_op_ball_iequal2:
    case nir_op_ball_fequal3:
@@ -922,6 +966,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+      /* The destination stride must be at least as big as the source stride. */
+      tmp.type = BRW_REGISTER_TYPE_W;
+      tmp.stride = 2;
+
+      bld.emit(BRW_OPCODE_F32TO16, tmp, op[0]);
+      inst = bld.emit(BRW_OPCODE_F16TO32, result, tmp);
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin:
@@ -1006,6 +1063,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 
    case nir_op_ubitfield_extract:
    case nir_op_ibitfield_extract:
+      unreachable("should have been lowered");
+   case nir_op_ubfe:
+   case nir_op_ibfe:
       bld.BFE(result, op[2], op[1], op[0]);
       break;
    case nir_op_bfm:
@@ -1016,8 +1076,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_bitfield_insert:
-      unreachable("not reached: should be handled by "
-                  "lower_instructions::bitfield_insert_to_bfm_bfi");
+      unreachable("not reached: should have been lowered");
 
    case nir_op_ishl:
       bld.SHL(result, op[0], op[1]);
@@ -1724,6 +1783,24 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    }
 }
 
+fs_reg
+fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (const_value) {
+      /* The only constant offset we should find is 0.  brw_nir.c's
+       * add_const_offset_to_base() will fold other constant offsets
+       * into instr->const_index[0].
+       */
+      assert(const_value->u[0] == 0);
+      return fs_reg();
+   }
+
+   return get_nir_src(*offset_src);
+}
+
 void
 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
@@ -1740,7 +1817,9 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
 
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id: {
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
       fs_reg val = nir_system_values[sv];
       assert(val.file != BAD_FILE);
@@ -1756,6 +1835,127 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
 }
 
 void
+fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+   struct brw_tes_prog_data *tes_prog_data = (struct brw_tes_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1-3 */
+      for (unsigned i = 0; i < 3; i++) {
+         bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
+      }
+      break;
+
+   case nir_intrinsic_load_tess_level_outer:
+      /* When the TES reads gl_TessLevelOuter, we ensure that the patch header
+       * appears as a push-model input.  So, we can simply use the ATTR file
+       * rather than issuing URB read messages.  The data is stored in the
+       * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains
+       * .y, and so on.
+       */
+      switch (tes_prog_data->domain) {
+      case BRW_TESS_DOMAIN_QUAD:
+         for (unsigned i = 0; i < 4; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      case BRW_TESS_DOMAIN_TRI:
+         for (unsigned i = 0; i < 3; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      case BRW_TESS_DOMAIN_ISOLINE:
+         for (unsigned i = 0; i < 2; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      }
+      break;
+
+   case nir_intrinsic_load_tess_level_inner:
+      /* When the TES reads gl_TessLevelInner, we ensure that the patch header
+       * appears as a push-model input.  So, we can simply use the ATTR file
+       * rather than issuing URB read messages.
+       */
+      switch (tes_prog_data->domain) {
+      case BRW_TESS_DOMAIN_QUAD:
+         bld.MOV(dest, component(fs_reg(ATTR, 0), 3));
+         bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2));
+         break;
+      case BRW_TESS_DOMAIN_TRI:
+         bld.MOV(dest, component(fs_reg(ATTR, 0), 4));
+         break;
+      case BRW_TESS_DOMAIN_ISOLINE:
+         /* ignore - value is undefined */
+         break;
+      }
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Arbitrarily only push up to 32 vec4 slots worth of data,
+          * which is 16 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 32;
+         if (imm_offset < max_push_slots) {
+            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
+            for (int i = 0; i < instr->num_components; i++) {
+               bld.MOV(offset(dest, bld, i),
+                       component(src, 4 * (imm_offset % 2) + i));
+            }
+            tes_prog_data->base.urb_read_length =
+               MAX2(tes_prog_data->base.urb_read_length,
+                    DIV_ROUND_UP(imm_offset + 1, 2));
+         } else {
+            /* Replicate the patch handle to all enabled channels */
+            const fs_reg srcs[] = {
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+            };
+            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+            inst->mlen = 1;
+            inst->offset = imm_offset;
+            inst->base_mrf = -1;
+            inst->regs_written = instr->num_components;
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
+         inst->mlen = 2;
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->regs_written = instr->num_components;
+      }
+      break;
+   }
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
 {
@@ -2130,6 +2330,82 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
       break;
 
+   case nir_intrinsic_load_shared: {
+      assert(devinfo->gen >= 7);
+
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Get the offset to read from */
+      fs_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
+      } else {
+         offset_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(offset_reg,
+                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(instr->const_index[0]));
+      }
+
+      /* Read the vector */
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */,
+                                             instr->num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (int i = 0; i < instr->num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+
+      break;
+   }
+
+   case nir_intrinsic_store_shared: {
+      assert(devinfo->gen >= 7);
+
+      /* Block index */
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Value */
+      fs_reg val_reg = get_nir_src(instr->src[0]);
+
+      /* Writemask */
+      unsigned writemask = instr->const_index[1];
+
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+         fs_reg offset_reg;
+
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+         if (const_offset) {
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
+                                    4 * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
+         }
+
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component),
+                            1 /* dims */, length,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
+      }
+
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -2471,82 +2747,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_shared: {
-      assert(devinfo->gen >= 7);
-
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Get the offset to read from */
-      fs_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
-      } else {
-         offset_reg = vgrf(glsl_type::uint_type);
-         bld.ADD(offset_reg,
-                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
-                 brw_imm_ud(instr->const_index[0]));
-      }
-
-      /* Read the vector */
-      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
-                                             1 /* dims */,
-                                             instr->num_components,
-                                             BRW_PREDICATE_NONE);
-      read_result.type = dest.type;
-      for (int i = 0; i < instr->num_components; i++)
-         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
-
-      break;
-   }
-
-   case nir_intrinsic_store_shared: {
-      assert(devinfo->gen >= 7);
-
-      /* Block index */
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Value */
-      fs_reg val_reg = get_nir_src(instr->src[0]);
-
-      /* Writemask */
-      unsigned writemask = instr->const_index[1];
-
-      /* Combine groups of consecutive enabled channels in one write
-       * message. We use ffs to find the first enabled channel and then ffs on
-       * the bit-inverse, down-shifted writemask to determine the length of
-       * the block of enabled bits.
-       */
-      while (writemask) {
-         unsigned first_component = ffs(writemask) - 1;
-         unsigned length = ffs(~(writemask >> first_component)) - 1;
-         fs_reg offset_reg;
-
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-         if (const_offset) {
-            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
-                                    4 * first_component);
-         } else {
-            offset_reg = vgrf(glsl_type::uint_type);
-            bld.ADD(offset_reg,
-                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
-                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
-         }
-
-         emit_untyped_write(bld, surf_index, offset_reg,
-                            offset(val_reg, bld, first_component),
-                            1 /* dims */, length,
-                            BRW_PREDICATE_NONE);
-
-         /* Clear the bits in the writemask that we just wrote, then try
-          * again to see if more channels are left.
-          */
-         writemask &= (15 << (first_component + length));
-      }
-
-      break;
-   }
-
    case nir_intrinsic_load_input: {
       fs_reg src;
       if (stage == MESA_SHADER_VERTEX) {
@@ -2790,7 +2990,9 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
 void
 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 {
+   unsigned texture = instr->texture_index;
    unsigned sampler = instr->sampler_index;
+   fs_reg texture_reg(brw_imm_ud(texture));
    fs_reg sampler_reg(brw_imm_ud(sampler));
 
    int gather_component = instr->component;
@@ -2803,6 +3005,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
 
+   /* Our hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       fs_reg src = get_nir_src(instr->src[i].src);
       switch (instr->src[i].src_type) {
@@ -2857,9 +3063,9 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       case nir_tex_src_projector:
          unreachable("should be lowered");
 
-      case nir_tex_src_sampler_offset: {
-         /* Figure out the highest possible sampler index and mark it as used */
-         uint32_t max_used = sampler + instr->sampler_array_size - 1;
+      case nir_tex_src_texture_offset: {
+         /* Figure out the highest possible texture index and mark it as used */
+         uint32_t max_used = texture + instr->texture_array_size - 1;
          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
             max_used += stage_prog_data->binding_table.gather_texture_start;
          } else {
@@ -2868,6 +3074,14 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          brw_mark_surface_used(prog_data, max_used);
 
          /* Emit code to evaluate the actual indexing expression */
+         texture_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(texture_reg, src, brw_imm_ud(texture));
+         texture_reg = bld.emit_uniformize(texture_reg);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
          bld.ADD(sampler_reg, src, brw_imm_ud(sampler));
          sampler_reg = bld.emit_uniformize(sampler_reg);
@@ -2882,8 +3096,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    if (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical) {
       if (devinfo->gen >= 7 &&
-          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
-         mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
+          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
+         mcs = emit_mcs_fetch(coordinate, instr->coord_components, texture_reg);
       } else {
          mcs = brw_imm_ud(0u);
       }
@@ -2920,7 +3134,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
       fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, dst,
                                bld.vgrf(BRW_REGISTER_TYPE_D, 1),
-                               sampler_reg);
+                               texture_reg, texture_reg);
       inst->mlen = 1;
       inst->header_size = 1;
       inst->base_mrf = -1;
@@ -2934,7 +3148,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    emit_texture(op, dest_type, coordinate, instr->coord_components,
                 shadow_comparitor, lod, lod2, lod_components, sample_index,
                 tex_offset, mcs, gather_component,
-                is_cube_array, sampler, sampler_reg);
+                is_cube_array, texture, texture_reg, sampler, sampler_reg);
 
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
index 75ca1da..0ff5cd6 100644 (file)
@@ -43,9 +43,14 @@ fs_visitor::emit_vs_system_value(int location)
    switch (location) {
    case SYSTEM_VALUE_BASE_VERTEX:
       reg->reg_offset = 0;
-      vs_prog_data->uses_vertexid = true;
+      vs_prog_data->uses_basevertex = true;
+      break;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      reg->reg_offset = 1;
+      vs_prog_data->uses_baseinstance = true;
       break;
    case SYSTEM_VALUE_VERTEX_ID:
+      unreachable("should have been lowered");
    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
       reg->reg_offset = 2;
       vs_prog_data->uses_vertexid = true;
@@ -54,6 +59,16 @@ fs_visitor::emit_vs_system_value(int location)
       reg->reg_offset = 3;
       vs_prog_data->uses_instanceid = true;
       break;
+   case SYSTEM_VALUE_DRAW_ID:
+      if (nir->info.system_values_read &
+          (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+           BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+           BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+           BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)))
+         reg->nr += 4;
+      reg->reg_offset = 0;
+      vs_prog_data->uses_drawid = true;
+      break;
    default:
       unreachable("not reached");
    }
@@ -64,12 +79,12 @@ fs_visitor::emit_vs_system_value(int location)
 /* Sample from the MCS surface attached to this multisample texture. */
 fs_reg
 fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
-                           const fs_reg &sampler)
+                           const fs_reg &texture)
 {
    const fs_reg dest = vgrf(glsl_type::uvec4_type);
    const fs_reg srcs[] = {
       coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
-      sampler, fs_reg(), brw_imm_ud(components), brw_imm_d(0)
+      texture, texture, fs_reg(), brw_imm_ud(components), brw_imm_d(0)
    };
    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                             ARRAY_SIZE(srcs));
@@ -93,6 +108,8 @@ fs_visitor::emit_texture(ir_texture_opcode op,
                          fs_reg mcs,
                          int gather_component,
                          bool is_cube_array,
+                         uint32_t surface,
+                         fs_reg surface_reg,
                          uint32_t sampler,
                          fs_reg sampler_reg)
 {
@@ -132,7 +149,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
    const fs_reg srcs[] = {
       coordinate, shadow_c, lod, lod2,
-      sample_index, mcs, sampler_reg, offset_value,
+      sample_index, mcs, surface_reg, sampler_reg, offset_value,
       brw_imm_d(coord_components), brw_imm_d(grad_components)
    };
    enum opcode opcode;
@@ -185,7 +202,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
    if (op == ir_tg4) {
       if (gather_component == 1 &&
-          key_tex->gather_channel_quirk_mask & (1 << sampler)) {
+          key_tex->gather_channel_quirk_mask & (1 << surface)) {
          /* gather4 sampler is broken for green channel on RG32F --
           * we must ask for blue instead.
           */
@@ -195,7 +212,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       }
 
       if (devinfo->gen == 6)
-         emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
+         emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], dst);
    }
 
    /* fixup #layers for cube map arrays */
@@ -700,7 +717,10 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
    fs_reg sources[8];
    fs_reg urb_handle;
 
-   urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+   if (stage == MESA_SHADER_TESS_EVAL)
+      urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
+   else
+      urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
    /* If we don't have any valid slots to write, just do a minimal urb write
     * send to terminate the shader.  This includes 1 slot of undefined data,
@@ -934,9 +954,11 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        struct gl_program *prog,
                        const nir_shader *shader,
                        unsigned dispatch_width,
-                       int shader_time_index)
+                       int shader_time_index,
+                       const struct brw_vue_map *input_vue_map)
    : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
      key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
+     input_vue_map(input_vue_map),
      dispatch_width(dispatch_width),
      shader_time_index(shader_time_index),
      bld(fs_builder(this, dispatch_width).at_end())
@@ -972,6 +994,9 @@ fs_visitor::init()
    case MESA_SHADER_VERTEX:
       key_tex = &((const brw_vs_prog_key *) key)->tex;
       break;
+   case MESA_SHADER_TESS_EVAL:
+      key_tex = &((const brw_tes_prog_key *) key)->tex;
+      break;
    case MESA_SHADER_GEOMETRY:
       key_tex = &((const brw_gs_prog_key *) key)->tex;
       break;
index 31d29ec..234afd5 100644 (file)
@@ -27,7 +27,6 @@
 #include "brw_nir.h"
 #include "brw_program.h"
 #include "glsl/ir_optimization.h"
-#include "glsl/glsl_parser_extras.h"
 #include "program/program.h"
 #include "main/shaderapi.h"
 #include "main/uniforms.h"
@@ -42,6 +41,8 @@ brw_shader_precompile(struct gl_context *ctx,
                       struct gl_shader_program *sh_prog)
 {
    struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+   struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
    struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
    struct gl_shader *cs = sh_prog->_LinkedShaders[MESA_SHADER_COMPUTE];
@@ -52,6 +53,12 @@ brw_shader_precompile(struct gl_context *ctx,
    if (gs && !brw_gs_precompile(ctx, sh_prog, gs->Program))
       return false;
 
+   if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program))
+      return false;
+
+   if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program))
+      return false;
+
    if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program))
       return false;
 
@@ -119,14 +126,12 @@ process_glsl_ir(gl_shader_stage stage,
     */
    brw_lower_packing_builtins(brw, shader->Stage, shader->ir);
    do_mat_op_to_vec(shader->ir);
-   const int bitfield_insert = brw->gen >= 7 ? BITFIELD_INSERT_TO_BFM_BFI : 0;
    lower_instructions(shader->ir,
                       MOD_TO_FLOOR |
                       DIV_TO_MUL_RCP |
                       SUB_TO_ADD_NEG |
                       EXP_TO_EXP2 |
                       LOG_TO_LOG2 |
-                      bitfield_insert |
                       LDEXP_TO_ARITH |
                       CARRY_TO_ARITH |
                       BORROW_TO_ARITH);
index d571ecd..c83b272 100644 (file)
@@ -279,7 +279,7 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 
       /* 2. quotient rule */
       ir_variable *recip = temp(mem_ctx, glsl_type::float_type, "recip");
-      EMIT(assign(recip, div(new(mem_ctx) ir_constant(1.0f), swizzle_z(Q))));
+      EMIT(assign(recip, expr(ir_unop_rcp, swizzle_z(Q))));
 
       ir_variable *dx = temp(mem_ctx, glsl_type::vec2_type, "dx");
       ir_variable *dy = temp(mem_ctx, glsl_type::vec2_type, "dy");
index ed47172..c5f6c4f 100644 (file)
@@ -409,8 +409,8 @@ set_read_rb_tex_image(struct gl_context *ctx, struct fb_tex_blit_state *blit,
    blit->baseLevelSave = tex_obj->BaseLevel;
    blit->maxLevelSave = tex_obj->MaxLevel;
    blit->stencilSamplingSave = tex_obj->StencilSampling;
-   blit->sampler = _mesa_meta_setup_sampler(ctx, tex_obj, *target,
-                                            GL_NEAREST, level);
+   blit->samp_obj = _mesa_meta_setup_sampler(ctx, tex_obj, *target,
+                                             GL_NEAREST, level);
    return true;
 }
 
index 14ad172..0985c2c 100644 (file)
 #include "glsl/nir/nir_builder.h"
 #include "program/prog_to_nir.h"
 
-struct remap_vs_attrs_state {
+static bool
+is_input(nir_intrinsic_instr *intrin)
+{
+   return intrin->intrinsic == nir_intrinsic_load_input ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
+}
+
+static bool
+is_output(nir_intrinsic_instr *intrin)
+{
+   return intrin->intrinsic == nir_intrinsic_load_output ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
+          intrin->intrinsic == nir_intrinsic_store_output ||
+          intrin->intrinsic == nir_intrinsic_store_per_vertex_output;
+}
+
+/**
+ * In many cases, we just add the base and offset together, so there's no
+ * reason to keep them separate.  Sometimes, combining them is essential:
+ * if a shader only accesses part of a compound variable (such as a matrix
+ * or array), the variable's base may not actually exist in the VUE map.
+ *
+ * This pass adds constant offsets to instr->const_index[0], and resets
+ * the offset source to 0.  Non-constant offsets remain unchanged - since
+ * we don't know what part of a compound variable is accessed, we allocate
+ * storage for the entire thing.
+ */
+struct add_const_offset_to_base_params {
    nir_builder b;
-   uint64_t inputs_read;
+   nir_variable_mode mode;
 };
 
 static bool
-remap_vs_attrs(nir_block *block, void *void_state)
+add_const_offset_to_base(nir_block *block, void *closure)
 {
-   struct remap_vs_attrs_state *state = void_state;
+   struct add_const_offset_to_base_params *params = closure;
+   nir_builder *b = &params->b;
 
    nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_intrinsic)
@@ -43,30 +71,120 @@ remap_vs_attrs(nir_block *block, void *void_state)
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
+      if ((params->mode == nir_var_shader_in && is_input(intrin)) ||
+          (params->mode == nir_var_shader_out && is_output(intrin))) {
+         nir_src *offset = nir_get_io_offset_src(intrin);
+         nir_const_value *const_offset = nir_src_as_const_value(*offset);
+
+         if (const_offset) {
+            intrin->const_index[0] += const_offset->u[0];
+            b->cursor = nir_before_instr(&intrin->instr);
+            nir_instr_rewrite_src(&intrin->instr, offset,
+                                  nir_src_for_ssa(nir_imm_int(b, 0)));
+         }
+      }
+   }
+   return true;
+
+}
+
+static bool
+remap_vs_attrs(nir_block *block, void *closure)
+{
+   GLbitfield64 inputs_read = *((GLbitfield64 *) closure);
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
       if (intrin->intrinsic == nir_intrinsic_load_input) {
          /* Attributes come in a contiguous block, ordered by their
           * gl_vert_attrib value.  That means we can compute the slot
           * number for an attribute by masking out the enabled attributes
           * before it and counting the bits.
           */
-         nir_const_value *const_offset = nir_src_as_const_value(intrin->src[0]);
+         int attr = intrin->const_index[0];
+         int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr));
 
-         /* We set EmitNoIndirect for VS inputs, so there are no indirects. */
-         assert(const_offset);
+         intrin->const_index[0] = 4 * slot;
+      }
+   }
+   return true;
+}
 
-         int attr = intrin->const_index[0] + const_offset->u[0];
-         int slot = _mesa_bitcount_64(state->inputs_read &
-                                      BITFIELD64_MASK(attr));
+static bool
+remap_inputs_with_vue_map(nir_block *block, void *closure)
+{
+   const struct brw_vue_map *vue_map = closure;
 
-         /* The NIR -> FS pass will just add the base and offset together, so
-          * there's no reason to keep them separate.  Just put it all in
-          * const_index[0] and set the offset src[0] to load_const(0).
-          */
-         intrin->const_index[0] = 4 * slot;
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-         state->b.cursor = nir_before_instr(&intrin->instr);
-         nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
-                               nir_src_for_ssa(nir_imm_int(&state->b, 0)));
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      if (intrin->intrinsic == nir_intrinsic_load_input ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
+         int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
+         assert(vue_slot != -1);
+         intrin->const_index[0] = vue_slot;
+      }
+   }
+   return true;
+}
+
+struct remap_patch_urb_offsets_state {
+   nir_builder b;
+   struct brw_vue_map vue_map;
+};
+
+static bool
+remap_patch_urb_offsets(nir_block *block, void *closure)
+{
+   struct remap_patch_urb_offsets_state *state = closure;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      gl_shader_stage stage = state->b.shader->stage;
+
+      if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
+          (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
+         int vue_slot = state->vue_map.varying_to_slot[intrin->const_index[0]];
+         assert(vue_slot != -1);
+         intrin->const_index[0] = vue_slot;
+
+         nir_src *vertex = nir_get_io_vertex_index_src(intrin);
+         if (vertex) {
+            nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
+            if (const_vertex) {
+               intrin->const_index[0] += const_vertex->u[0] *
+                                         state->vue_map.num_per_vertex_slots;
+            } else {
+               state->b.cursor = nir_before_instr(&intrin->instr);
+
+               /* Multiply by the number of per-vertex slots. */
+               nir_ssa_def *vertex_offset =
+                  nir_imul(&state->b,
+                           nir_ssa_for_src(&state->b, *vertex, 1),
+                           nir_imm_int(&state->b,
+                                       state->vue_map.num_per_vertex_slots));
+
+               /* Add it to the existing offset */
+               nir_src *offset = nir_get_io_offset_src(intrin);
+               nir_ssa_def *total_offset =
+                  nir_iadd(&state->b, vertex_offset,
+                           nir_ssa_for_src(&state->b, *offset, 1));
+
+               nir_instr_rewrite_src(&intrin->instr, offset,
+                                     nir_src_for_ssa(total_offset));
+            }
+         }
       }
    }
    return true;
@@ -77,6 +195,10 @@ brw_nir_lower_inputs(nir_shader *nir,
                      const struct brw_device_info *devinfo,
                      bool is_scalar)
 {
+   struct add_const_offset_to_base_params params = {
+      .mode = nir_var_shader_in
+   };
+
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
       /* Start with the location of the variable's base. */
@@ -97,23 +219,23 @@ brw_nir_lower_inputs(nir_shader *nir,
           * key->inputs_read since the two are identical aside from Gen4-5
           * edge flag differences.
           */
-         struct remap_vs_attrs_state remap_state = {
-            .inputs_read = nir->info.inputs_read,
-         };
+         GLbitfield64 inputs_read = nir->info.inputs_read;
 
          /* This pass needs actual constants */
          nir_opt_constant_folding(nir);
 
-         nir_foreach_overload(nir, overload) {
-            if (overload->impl) {
-               nir_builder_init(&remap_state.b, overload->impl);
-               nir_foreach_block(overload->impl, remap_vs_attrs, &remap_state);
+         nir_foreach_function(nir, function) {
+            if (function->impl) {
+               nir_builder_init(&params.b, function->impl);
+               nir_foreach_block(function->impl, add_const_offset_to_base, &params);
+               nir_foreach_block(function->impl, remap_vs_attrs, &inputs_read);
             }
          }
       }
       break;
+   case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_GEOMETRY: {
-      if (!is_scalar) {
+      if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY) {
          foreach_list_typed(nir_variable, var, node, &nir->inputs) {
             var->data.driver_location = var->data.location;
          }
@@ -135,17 +257,52 @@ brw_nir_lower_inputs(nir_shader *nir,
          GLbitfield64 inputs_read =
             nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
          brw_compute_vue_map(devinfo, &input_vue_map, inputs_read,
-                             nir->info.separate_shader);
+                             nir->info.separate_shader ||
+                             nir->stage == MESA_SHADER_TESS_CTRL);
 
-         /* Start with the slot for the variable's base. */
          foreach_list_typed(nir_variable, var, node, &nir->inputs) {
-            assert(input_vue_map.varying_to_slot[var->data.location] != -1);
-            var->data.driver_location =
-               input_vue_map.varying_to_slot[var->data.location];
+            var->data.driver_location = var->data.location;
          }
 
          /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
          nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
+
+         /* This pass needs actual constants */
+         nir_opt_constant_folding(nir);
+
+         nir_foreach_function(nir, function) {
+            if (function->impl) {
+               nir_builder_init(&params.b, function->impl);
+               nir_foreach_block(function->impl, add_const_offset_to_base, &params);
+               nir_foreach_block(function->impl, remap_inputs_with_vue_map,
+                                 &input_vue_map);
+            }
+         }
+      }
+      break;
+   }
+   case MESA_SHADER_TESS_EVAL: {
+      struct remap_patch_urb_offsets_state state;
+      brw_compute_tess_vue_map(&state.vue_map,
+                               nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                               nir->info.patch_inputs_read);
+
+      foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+         var->data.driver_location = var->data.location;
+      }
+
+      nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
+
+      /* This pass needs actual constants */
+      nir_opt_constant_folding(nir);
+
+      nir_foreach_function(nir, function) {
+         if (function->impl) {
+            nir_builder_init(&params.b, function->impl);
+            nir_foreach_block(function->impl, add_const_offset_to_base, &params);
+            nir_builder_init(&state.b, function->impl);
+            nir_foreach_block(function->impl, remap_patch_urb_offsets, &state);
+         }
       }
       break;
    }
@@ -164,10 +321,13 @@ brw_nir_lower_inputs(nir_shader *nir,
 }
 
 static void
-brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
+brw_nir_lower_outputs(nir_shader *nir,
+                      const struct brw_device_info *devinfo,
+                      bool is_scalar)
 {
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       if (is_scalar) {
          nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
@@ -178,6 +338,34 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
             var->data.driver_location = var->data.location;
       }
       break;
+   case MESA_SHADER_TESS_CTRL: {
+      struct add_const_offset_to_base_params params = {
+         .mode = nir_var_shader_out
+      };
+
+      struct remap_patch_urb_offsets_state state;
+      brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written,
+                               nir->info.patch_outputs_written);
+
+      nir_foreach_variable(var, &nir->outputs) {
+         var->data.driver_location = var->data.location;
+      }
+
+      nir_lower_io(nir, nir_var_shader_out, type_size_vec4);
+
+      /* This pass needs actual constants */
+      nir_opt_constant_folding(nir);
+
+      nir_foreach_function(nir, function) {
+         if (function->impl) {
+            nir_builder_init(&params.b, function->impl);
+            nir_foreach_block(function->impl, add_const_offset_to_base, &params);
+            nir_builder_init(&state.b, function->impl);
+            nir_foreach_block(function->impl, remap_patch_urb_offsets, &state);
+         }
+      }
+      break;
+   }
    case MESA_SHADER_FRAGMENT:
       nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
                                type_size_scalar);
@@ -217,42 +405,23 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
    }
 }
 
-#include "util/debug.h"
-
-static bool
-should_clone_nir()
+static void
+brw_nir_lower_shared(nir_shader *nir)
 {
-   static int should_clone = -1;
-   if (should_clone < 1)
-      should_clone = env_var_as_boolean("NIR_TEST_CLONE", false);
-
-   return should_clone;
+   nir_assign_var_locations(&nir->shared, &nir->num_shared,
+                            type_size_scalar_bytes);
+   nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes);
 }
 
-#define _OPT(do_pass) (({                                            \
-   bool this_progress = true;                                        \
-   do_pass                                                           \
-   nir_validate_shader(nir);                                         \
-   if (should_clone_nir()) {                                         \
-      nir_shader *clone = nir_shader_clone(ralloc_parent(nir), nir); \
-      ralloc_free(nir);                                              \
-      nir = clone;                                                   \
-   }                                                                 \
-   this_progress;                                                    \
-}))
-
-#define OPT(pass, ...) _OPT(                   \
-   nir_metadata_set_validation_flag(nir);      \
-   this_progress = pass(nir ,##__VA_ARGS__);   \
-   if (this_progress) {                        \
-      progress = true;                         \
-      nir_metadata_check_validation_flag(nir); \
-   }                                           \
-)
-
-#define OPT_V(pass, ...) _OPT( \
-   pass(nir, ##__VA_ARGS__);   \
-)
+#define OPT(pass, ...) ({                                  \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress)                                      \
+      progress = true;                                     \
+   this_progress;                                          \
+})
+
+#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 
 static nir_shader *
 nir_optimize(nir_shader *nir, bool is_scalar)
@@ -323,42 +492,26 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
    /* Get rid of split copies */
    nir = nir_optimize(nir, is_scalar);
 
-   OPT(nir_remove_dead_variables);
+   OPT(nir_remove_dead_variables, nir_var_local);
 
    return nir;
 }
 
-/* Lowers inputs, outputs, uniforms, and samplers for i965
- *
- * This function does all of the standard lowering prior to post-processing.
- * The lowering done is highly gen, stage, and backend-specific.  The
- * shader_prog parameter is optional and is used only for lowering sampler
- * derefs and atomics for GLSL shaders.
- */
+/** Lower input and output loads and stores for i965. */
 nir_shader *
-brw_lower_nir(nir_shader *nir,
-              const struct brw_device_info *devinfo,
-              const struct gl_shader_program *shader_prog,
-              bool is_scalar)
+brw_nir_lower_io(nir_shader *nir,
+                 const struct brw_device_info *devinfo,
+                 bool is_scalar)
 {
    bool progress; /* Written by OPT and OPT_V */
    (void)progress;
 
    OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
-   OPT_V(brw_nir_lower_outputs, is_scalar);
-   OPT_V(brw_nir_lower_uniforms, is_scalar);
+   OPT_V(brw_nir_lower_outputs, devinfo, is_scalar);
+   if (nir->stage == MESA_SHADER_COMPUTE)
+      OPT_V(brw_nir_lower_shared);
    OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4);
 
-   if (shader_prog) {
-      OPT_V(nir_lower_samplers, shader_prog);
-   }
-
-   OPT(nir_lower_system_values);
-
-   if (shader_prog) {
-      OPT_V(nir_lower_atomics, shader_prog);
-   }
-
    return nir_optimize(nir, is_scalar);
 }
 
@@ -395,9 +548,9 @@ brw_postprocess_nir(nir_shader *nir,
 
    if (unlikely(debug_enabled)) {
       /* Re-index SSA defs so we print more sensible numbers. */
-      nir_foreach_overload(nir, overload) {
-         if (overload->impl)
-            nir_index_ssa_defs(overload->impl);
+      nir_foreach_function(nir, function) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
       }
 
       fprintf(stderr, "NIR (SSA form) for %s shader:\n",
@@ -457,7 +610,19 @@ brw_create_nir(struct brw_context *brw,
    (void)progress;
 
    nir = brw_preprocess_nir(nir, is_scalar);
-   nir = brw_lower_nir(nir, devinfo, shader_prog, is_scalar);
+
+   OPT(nir_lower_system_values);
+   OPT_V(brw_nir_lower_uniforms, is_scalar);
+
+   if (shader_prog) {
+      OPT_V(nir_lower_samplers, shader_prog);
+      OPT_V(nir_lower_atomics, shader_prog);
+   }
+
+   if (nir->stage != MESA_SHADER_TESS_CTRL &&
+       nir->stage != MESA_SHADER_TESS_EVAL) {
+      nir = brw_nir_lower_io(nir, devinfo, is_scalar);
+   }
 
    return nir;
 }
index 0a8a5a2..78b139b 100644 (file)
@@ -82,10 +82,9 @@ nir_shader *brw_create_nir(struct brw_context *brw,
                            bool is_scalar);
 
 nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
-nir_shader *brw_lower_nir(nir_shader *nir,
-                          const struct brw_device_info *devinfo,
-                          const struct gl_shader_program *shader_prog,
-                          bool is_scalar);
+nir_shader *brw_nir_lower_io(nir_shader *nir,
+                            const struct brw_device_info *devinfo,
+                            bool is_scalar);
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_device_info *devinfo,
                                 bool is_scalar);
index c995d2b..56e15ef 100644 (file)
@@ -109,9 +109,6 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          uint8_t resolve_status;
          nir_alu_instr *alu = nir_instr_as_alu(instr);
          switch (alu->op) {
-         case nir_op_bany2:
-         case nir_op_bany3:
-         case nir_op_bany4:
          case nir_op_ball_fequal2:
          case nir_op_ball_iequal2:
          case nir_op_ball_fequal3:
@@ -263,7 +260,8 @@ analyze_boolean_resolves_impl(nir_function_impl *impl)
 void
 brw_nir_analyze_boolean_resolves(nir_shader *shader)
 {
-   nir_foreach_overload(shader, overload)
-      if (overload->impl)
-         analyze_boolean_resolves_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         analyze_boolean_resolves_impl(function->impl);
+   }
 }
index 5603129..5ff2cba 100644 (file)
@@ -290,9 +290,9 @@ brw_nir_opt_peephole_ffma(nir_shader *shader)
 {
    bool progress = false;
 
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress |= brw_nir_opt_peephole_ffma_impl(overload->impl);
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= brw_nir_opt_peephole_ffma_impl(function->impl);
    }
 
    return progress;
index ae3d818..6c636d2 100644 (file)
@@ -97,7 +97,8 @@ void
 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 {
    if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
+      if (brw->gen == 8)
+         gen8_add_cs_stall_workaround_bits(&flags);
 
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
@@ -141,7 +142,8 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                             uint32_t imm_lower, uint32_t imm_upper)
 {
    if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
+      if (brw->gen == 8)
+         gen8_add_cs_stall_workaround_bits(&flags);
 
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
index 20d4e0d..94ceb52 100644 (file)
@@ -279,7 +279,7 @@ brw_get_scratch_bo(struct brw_context *brw,
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
 {
-   assert(functions->ProgramStringNotify == _tnl_program_string);
+   /* assert(functions->ProgramStringNotify == _tnl_program_string); */
 
    functions->NewProgram = brwNewProgram;
    functions->DeleteProgram = brwDeleteProgram;
index 339b8e1..059ccf8 100644 (file)
@@ -56,6 +56,11 @@ void
 brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog);
 
+void brw_upload_tcs_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
+void brw_upload_tes_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
index fa912c9..a2a4a40 100644 (file)
@@ -84,6 +84,7 @@ struct brw_device_info;
 #define BRW_SWIZZLE_YZXW      BRW_SWIZZLE4(1,2,0,3)
 #define BRW_SWIZZLE_ZXYW      BRW_SWIZZLE4(2,0,1,3)
 #define BRW_SWIZZLE_ZWZW      BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZYX      BRW_SWIZZLE4(3,2,1,0)
 
 static inline bool
 brw_is_single_value_swizzle(unsigned swiz)
@@ -286,33 +287,6 @@ type_sz(unsigned type)
    }
 }
 
-static inline bool
-type_is_signed(unsigned type)
-{
-   switch(type) {
-   case BRW_REGISTER_TYPE_D:
-   case BRW_REGISTER_TYPE_W:
-   case BRW_REGISTER_TYPE_F:
-   case BRW_REGISTER_TYPE_B:
-   case BRW_REGISTER_TYPE_V:
-   case BRW_REGISTER_TYPE_VF:
-   case BRW_REGISTER_TYPE_DF:
-   case BRW_REGISTER_TYPE_HF:
-   case BRW_REGISTER_TYPE_Q:
-      return true;
-
-   case BRW_REGISTER_TYPE_UD:
-   case BRW_REGISTER_TYPE_UW:
-   case BRW_REGISTER_TYPE_UB:
-   case BRW_REGISTER_TYPE_UV:
-   case BRW_REGISTER_TYPE_UQ:
-      return false;
-
-   default:
-      unreachable("not reached");
-   }
-}
-
 /**
  * Construct a brw_reg.
  * \param file      one of the BRW_x_REGISTER_FILE values
index 3f29e2f..d181468 100644 (file)
@@ -654,7 +654,7 @@ const struct brw_tracked_state brw_gs_samplers = {
 static void
 brw_upload_tcs_samplers(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
    if (!tcs)
       return;
@@ -667,7 +667,7 @@ const struct brw_tracked_state brw_tcs_samplers = {
    .dirty = {
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_samplers,
 };
@@ -676,7 +676,7 @@ const struct brw_tracked_state brw_tcs_samplers = {
 static void
 brw_upload_tes_samplers(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
    if (!tes)
       return;
@@ -689,7 +689,7 @@ const struct brw_tracked_state brw_tes_samplers = {
    .dirty = {
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_TESS_EVAL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tes_samplers,
 };
index 7a6751b..fec96ba 100644 (file)
@@ -24,8 +24,9 @@
 #include "brw_context.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
+#include "brw_fs.h"
 #include "brw_nir.h"
-#include "glsl/glsl_parser_extras.h"
+#include "brw_vec4_tes.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
 #include "util/debug.h"
@@ -84,6 +85,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
    compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
@@ -92,12 +96,22 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    nir_shader_compiler_options *nir_options =
       rzalloc(compiler, nir_shader_compiler_options);
    nir_options->native_integers = true;
+   nir_options->vertex_id_zero_based = true;
+   nir_options->lower_fdiv = true;
    /* In order to help allow for better CSE at the NIR level we tell NIR
     * to split all ffma instructions during opt_algebraic and we then
     * re-combine them as a later step.
     */
    nir_options->lower_ffma = true;
    nir_options->lower_sub = true;
+   nir_options->lower_fdiv = true;
+   nir_options->lower_scmp = true;
+   nir_options->lower_fmod = true;
+   nir_options->lower_bitfield_extract = true;
+   nir_options->lower_bitfield_insert = true;
+   nir_options->lower_uadd_carry = true;
+   nir_options->lower_usub_borrow = true;
+
    /* In the vec4 backend, our dpN instruction replicates its result to all
     * the components of a vec4.  We would like NIR to give us replicated fdot
     * instructions because it can optimize better for us.
@@ -135,6 +149,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
 
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+
    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
 
@@ -198,6 +215,7 @@ brw_type_for_base_type(const struct glsl_type *type)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -545,6 +563,33 @@ brw_instruction_name(enum opcode op)
       return "mulh";
    case SHADER_OPCODE_MOV_INDIRECT:
       return "mov_indirect";
+
+   case VEC4_OPCODE_URB_READ:
+      return "urb_read";
+   case TCS_OPCODE_GET_INSTANCE_ID:
+      return "tcs_get_instance_id";
+   case TCS_OPCODE_URB_WRITE:
+      return "tcs_urb_write";
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+      return "tcs_set_input_urb_offsets";
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return "tcs_set_output_urb_offsets";
+   case TCS_OPCODE_GET_PRIMITIVE_ID:
+      return "tcs_get_primitive_id";
+   case TCS_OPCODE_CREATE_BARRIER_HEADER:
+      return "tcs_create_barrier_header";
+   case TCS_OPCODE_SRC0_010_IS_ZERO:
+      return "tcs_src0<0,1,0>_is_zero";
+   case TCS_OPCODE_RELEASE_INPUT:
+      return "tcs_release_input";
+   case TCS_OPCODE_THREAD_END:
+      return "tcs_thread_end";
+   case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+      return "tes_create_input_read_header";
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+      return "tes_add_indirect_urb_offset";
+   case TES_OPCODE_GET_PRIMITIVE_ID:
+      return "tes_get_primitive_id";
    }
 
    unreachable("not reached");
@@ -980,6 +1025,8 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_FB_WRITE:
    case SHADER_OPCODE_BARRIER:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
       return true;
    default:
       return false;
@@ -1289,3 +1336,109 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx)
    }
 }
 
+extern "C" const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                struct brw_tes_prog_data *prog_data,
+                const nir_shader *src_shader,
+                struct gl_shader_program *shader_prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct brw_device_info *devinfo = compiler->devinfo;
+   struct gl_shader *shader =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.inputs_read = key->inputs_read;
+   nir->info.patch_inputs_read = key->patch_inputs_read;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
+
+   brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
+
+   unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size");
+      return NULL;
+   }
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_tess_vue_map(&input_vue_map,
+                            nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                            nir->info.patch_inputs_read);
+
+   bool need_patch_header = nir->info.system_values_read &
+      (BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_OUTER) |
+       BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_INNER));
+
+   /* The TES will pull most inputs using URB read messages.
+    *
+    * However, we push the patch header for TessLevel factors when required,
+    * as it's a tiny amount of extra data.
+    */
+   prog_data->base.urb_read_length = need_patch_header ? 1 : 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+      fprintf(stderr, "TES Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TES Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, shader->Program, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tes()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_EVAL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation evaluation shader %s",
+                                        nir->info.label ? nir->info.label
+                                                        : "unnamed",
+                                        nir->info.name));
+      }
+
+      g.generate_code(v.cfg, 8);
+
+      return g.get_assembly(final_assembly_size);
+   } else {
+      brw::vec4_tes_visitor v(compiler, log_data, key, prog_data,
+                             nir, mem_ctx, shader_time_index);
+      if (!v.run()) {
+        if (error_str)
+           *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+        return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TES))
+        v.dump_instructions();
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                       &prog_data->base, v.cfg,
+                                       final_assembly_size);
+   }
+}
index 8c5778f..82374a4 100644 (file)
@@ -259,9 +259,6 @@ struct brw_gs_compile
    unsigned control_data_header_size_bits;
 };
 
-struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
-
 void
 brw_assign_common_binding_table_offsets(gl_shader_stage stage,
                                         const struct brw_device_info *devinfo,
@@ -273,6 +270,12 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage,
 bool brw_vs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
+bool brw_tcs_precompile(struct gl_context *ctx,
+                        struct gl_shader_program *shader_prog,
+                        struct gl_program *prog);
+bool brw_tes_precompile(struct gl_context *ctx,
+                        struct gl_shader_program *shader_prog,
+                        struct gl_program *prog);
 bool brw_gs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
index 3d3a6cf..4666788 100644 (file)
@@ -319,10 +319,13 @@ dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index)
              GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT),
              GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1,
              1 << GET_BITS(surf[4], 5, 3));
-   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n",
+   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d,"
+             " tr_mode (gen9+): %d, mip tail (gen9+): %d\n",
              GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET),
              GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET),
-             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD));
+             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD),
+             GET_FIELD(surf[5], GEN9_SURFACE_TRMODE),
+             GET_FIELD(surf[5], GEN9_SURFACE_MIP_TAIL_START_LOD));
    batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n",
              GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2,
              GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2);
index cf3cf97..2a671a5 100644 (file)
@@ -196,10 +196,14 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
    &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Haswell */
 
    &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+   &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
+   &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
    &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
    &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
 
    &gen6_vs_push_constants, /* Before vs_state */
+   &gen7_tcs_push_constants,
+   &gen7_tes_push_constants,
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
 
@@ -209,6 +213,12 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
    &brw_vs_pull_constants,
    &brw_vs_ubo_surfaces,
    &brw_vs_abo_surfaces,
+   &brw_tcs_pull_constants,
+   &brw_tcs_ubo_surfaces,
+   &brw_tcs_abo_surfaces,
+   &brw_tes_pull_constants,
+   &brw_tes_ubo_surfaces,
+   &brw_tes_abo_surfaces,
    &brw_gs_pull_constants,
    &brw_gs_ubo_surfaces,
    &brw_gs_abo_surfaces,
@@ -218,11 +228,15 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
    &gen6_renderbuffer_surfaces,
    &brw_texture_surfaces,
    &brw_vs_binding_table,
+   &brw_tcs_binding_table,
+   &brw_tes_binding_table,
    &brw_gs_binding_table,
    &brw_wm_binding_table,
 
    &brw_fs_samplers,
    &brw_vs_samplers,
+   &brw_tcs_samplers,
+   &brw_tes_samplers,
    &brw_gs_samplers,
    &gen6_multisample_state,
 
@@ -517,6 +531,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
    ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS;
+   ctx->DriverFlags.NewDefaultTessLevels = BRW_NEW_DEFAULT_TESS_LEVELS;
 }
 
 
@@ -607,8 +622,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_URB_FENCE),
    DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
    DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM),
-   DEFINE_BIT(BRW_NEW_TESS_EVAL_PROGRAM),
-   DEFINE_BIT(BRW_NEW_TESS_CTRL_PROGRAM),
+   DEFINE_BIT(BRW_NEW_TESS_PROGRAMS),
    DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
    DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
    DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
@@ -620,6 +634,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_VERTICES),
+   DEFINE_BIT(BRW_NEW_DEFAULT_TESS_LEVELS),
    DEFINE_BIT(BRW_NEW_BATCH),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VS_CONSTBUF),
@@ -673,11 +688,40 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map)
 }
 
 static inline void
+brw_upload_tess_programs(struct brw_context *brw)
+{
+   if (brw->tess_eval_program) {
+      uint64_t per_vertex_slots = brw->tess_eval_program->Base.InputsRead;
+      uint32_t per_patch_slots =
+         brw->tess_eval_program->Base.PatchInputsRead;
+
+      /* The TCS may have additional outputs which aren't read by the
+       * TES (possibly for cross-thread communication).  These need to
+       * be stored in the Patch URB Entry as well.
+       */
+      if (brw->tess_ctrl_program) {
+         per_vertex_slots |= brw->tess_ctrl_program->Base.OutputsWritten;
+         per_patch_slots |=
+            brw->tess_ctrl_program->Base.PatchOutputsWritten;
+      }
+
+      brw_upload_tcs_prog(brw, per_vertex_slots, per_patch_slots);
+      brw_upload_tes_prog(brw, per_vertex_slots, per_patch_slots);
+   } else {
+      brw->tcs.prog_data = NULL;
+      brw->tcs.base.prog_data = NULL;
+      brw->tes.prog_data = NULL;
+      brw->tes.base.prog_data = NULL;
+   }
+}
+
+static inline void
 brw_upload_programs(struct brw_context *brw,
                     enum brw_pipeline pipeline)
 {
    if (pipeline == BRW_RENDER_PIPELINE) {
       brw_upload_vs_prog(brw);
+      brw_upload_tess_programs(brw);
 
       if (brw->gen < 6)
          brw_upload_ff_gs_prog(brw);
@@ -691,6 +735,8 @@ brw_upload_programs(struct brw_context *brw,
       bool old_separate = brw->vue_map_geom_out.separate;
       if (brw->geometry_program)
          brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
+      else if (brw->tess_eval_program)
+         brw->vue_map_geom_out = brw->tes.prog_data->base.vue_map;
       else
          brw->vue_map_geom_out = brw->vs.prog_data->base.vue_map;
 
@@ -750,12 +796,12 @@ brw_upload_pipeline_state(struct brw_context *brw,
 
       if (brw->tess_eval_program != ctx->TessEvalProgram._Current) {
          brw->tess_eval_program = ctx->TessEvalProgram._Current;
-         brw->ctx.NewDriverState |= BRW_NEW_TESS_EVAL_PROGRAM;
+         brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS;
       }
 
       if (brw->tess_ctrl_program != ctx->TessCtrlProgram._Current) {
          brw->tess_ctrl_program = ctx->TessCtrlProgram._Current;
-         brw->ctx.NewDriverState |= BRW_NEW_TESS_CTRL_PROGRAM;
+         brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS;
       }
 
       if (brw->geometry_program != ctx->GeometryProgram._Current) {
index 7bc8b8b..56813bf 100644 (file)
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-
-struct surface_format_info {
-   bool exists;
-   int sampling;
-   int filtering;
-   int shadow_compare;
-   int chroma_key;
-   int render_target;
-   int alpha_blend;
-   int input_vb;
-   int streamed_output_vb;
-   int color_processing;
-   int lossless_compression;
-   const char *name;
-};
+#include "brw_wm.h"
+#include "brw_surface_formats.h"
 
 /* This macro allows us to write the table almost as it appears in the PRM,
  * while restructuring it to turn it into the C code we want.
@@ -86,7 +73,7 @@ struct surface_format_info {
  * - VOL4_Part1 section 3.9.11 Render Target Write.
  * - Render Target Surface Types [SKL+]
  */
-const struct surface_format_info surface_formats[] = {
+const struct brw_surface_format_info surface_formats[] = {
 /* smpl filt shad CK  RT  AB  VB  SO  color ccs_e */
    SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x,   90,   R32G32B32A32_FLOAT)
    SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   90,   R32G32B32A32_SINT)
@@ -618,7 +605,7 @@ brw_init_surface_formats(struct brw_context *brw)
 
    for (format = MESA_FORMAT_NONE + 1; format < MESA_FORMAT_COUNT; format++) {
       uint32_t texture, render;
-      const struct surface_format_info *rinfo, *tinfo;
+      const struct brw_surface_format_info *rinfo, *tinfo;
       bool is_integer = _mesa_is_format_integer_color(format);
 
       render = texture = brw_format_for_mesa_format(format);
@@ -736,6 +723,34 @@ brw_init_surface_formats(struct brw_context *brw)
    if (brw->gen >= 8)
       ctx->TextureFormatSupported[MESA_FORMAT_Z_UNORM16] = true;
 
+   /* The RGBX formats are not renderable. Normally these get mapped
+    * internally to RGBA formats when rendering. However on Gen9+ when this
+    * internal override is used fast clears don't work so they are disabled in
+    * brw_meta_fast_clear. To avoid this problem we can just pretend not to
+    * support RGBX formats at all. This will cause the upper layers of Mesa to
+    * pick the RGBA formats instead. This works fine because when it is used
+    * as a texture source the swizzle state is programmed to force the alpha
+    * channel to 1.0 anyway. We could also do this for all gens except that
+    * it's a bit more difficult when the hardware doesn't support texture
+    * swizzling. Gens using the blorp have further problems because that
+    * doesn't implement this swizzle override. We don't need to do this for
+    * BGRX because that actually is supported natively on Gen8+.
+    */
+   if (brw->gen >= 9) {
+      static const mesa_format rgbx_formats[] = {
+         MESA_FORMAT_R8G8B8X8_UNORM,
+         MESA_FORMAT_R8G8B8X8_SRGB,
+         MESA_FORMAT_RGBX_UNORM16,
+         MESA_FORMAT_RGBX_FLOAT16,
+         MESA_FORMAT_RGBX_FLOAT32
+      };
+
+      for (int i = 0; i < ARRAY_SIZE(rgbx_formats); i++) {
+         ctx->TextureFormatSupported[rgbx_formats[i]] = false;
+         brw->format_supported_as_render_target[rgbx_formats[i]] = false;
+      }
+   }
+
    /* On hardware that lacks support for ETC1, we map ETC1 to RGBX
     * during glCompressedTexImage2D(). See intel_mipmap_tree::wraps_etc1.
     */
@@ -799,7 +814,7 @@ bool
 brw_losslessly_compressible_format(struct brw_context *brw,
                                    uint32_t brw_format)
 {
-   const struct surface_format_info * const sinfo =
+   const struct brw_surface_format_info * const sinfo =
       &surface_formats[brw_format];
    const int gen = brw->gen * 10;
 
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.h b/src/mesa/drivers/dri/i965/brw_surface_formats.h
new file mode 100644 (file)
index 0000000..a5cd49f
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+struct brw_surface_format_info {
+   bool exists;
+   int sampling;
+   int filtering;
+   int shadow_compare;
+   int chroma_key;
+   int render_target;
+   int alpha_blend;
+   int input_vb;
+   int streamed_output_vb;
+   int color_processing;
+   int lossless_compression;
+   const char *name;
+};
+
+extern const struct brw_surface_format_info surface_formats[];
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
new file mode 100644 (file)
index 0000000..7e41426
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_tcs.c
+ *
+ * Tessellation control shader state upload code.
+ */
+
+#include "brw_context.h"
+#include "brw_nir.h"
+#include "brw_program.h"
+#include "brw_shader.h"
+#include "brw_state.h"
+#include "program/prog_parameter.h"
+
+static void
+brw_tcs_debug_recompile(struct brw_context *brw,
+                       struct gl_shader_program *shader_prog,
+                       const struct brw_tcs_prog_key *key)
+{
+   struct brw_cache_item *c = NULL;
+   const struct brw_tcs_prog_key *old_key = NULL;
+   bool found = false;
+
+   perf_debug("Recompiling tessellation control shader for program %d\n",
+              shader_prog->Name);
+
+   for (unsigned int i = 0; i < brw->cache.size; i++) {
+      for (c = brw->cache.items[i]; c; c = c->next) {
+         if (c->cache_id == BRW_CACHE_TCS_PROG) {
+            old_key = c->key;
+
+            if (old_key->program_string_id == key->program_string_id)
+               break;
+         }
+      }
+      if (c)
+         break;
+   }
+
+   if (!c) {
+      perf_debug("  Didn't find previous compile in the shader cache for "
+                 "debug\n");
+      return;
+   }
+
+   found |= key_debug(brw, "input vertices", old_key->input_vertices,
+                      key->input_vertices);
+   found |= key_debug(brw, "outputs written", old_key->outputs_written,
+                      key->outputs_written);
+   found |= key_debug(brw, "patch outputs written", old_key->patch_outputs_written,
+                      key->patch_outputs_written);
+   found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
+                      key->tes_primitive_mode);
+   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+
+   if (!found) {
+      perf_debug("  Something else\n");
+   }
+}
+
+static bool
+brw_codegen_tcs_prog(struct brw_context *brw,
+                     struct gl_shader_program *shader_prog,
+                     struct brw_tess_ctrl_program *tcp,
+                     struct brw_tcs_prog_key *key)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   nir_shader *nir;
+   struct brw_tcs_prog_data prog_data;
+   bool start_busy = false;
+   double start_time = 0;
+
+   if (tcp) {
+      nir = tcp->program.Base.nir;
+   } else {
+      /* Create a dummy nir_shader.  We won't actually use NIR code to
+       * generate assembly (it's easier to generate assembly directly),
+       * but the whole compiler assumes one of these exists.
+       */
+      const nir_shader_compiler_options *options =
+         ctx->Const.ShaderCompilerOptions[MESA_SHADER_TESS_CTRL].NirOptions;
+      nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, options);
+      nir->num_uniforms = 2; /* both halves of the patch header */
+      nir->info.outputs_written = key->outputs_written;
+      nir->info.inputs_read = key->outputs_written;
+      nir->info.tcs.vertices_out = key->input_vertices;
+      nir->info.name = ralloc_strdup(nir, "passthrough");
+   }
+
+   memset(&prog_data, 0, sizeof(prog_data));
+
+   /* Allocate the references to the uniforms that will end up in the
+    * prog_data associated with the compiled program, and which will be freed
+    * by the state cache.
+    *
+    * Note: param_count needs to be num_uniform_components * 4, since we add
+    * padding around uniform values below vec4 size, so the worst case is that
+    * every uniform is a float which gets padded to the size of a vec4.
+    */
+   struct gl_shader *tcs = shader_prog ?
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL] : NULL;
+   int param_count = nir->num_uniforms;
+   if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL])
+      param_count *= 4;
+
+   prog_data.base.base.param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.pull_param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.nr_params = param_count;
+
+   if (tcs) {
+      prog_data.base.base.image_param =
+         rzalloc_array(NULL, struct brw_image_param, tcs->NumImages);
+      prog_data.base.base.nr_image_params = tcs->NumImages;
+
+      brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
+                                  &prog_data.base.base, false);
+   } else {
+      /* Upload the Patch URB Header as the first two uniforms.
+       * Do the annoying scrambling so the shader doesn't have to.
+       */
+      const float **param = (const float **) prog_data.base.base.param;
+      static float zero = 0.0f;
+      for (int i = 0; i < 4; i++) {
+         param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i];
+      }
+
+      if (key->tes_primitive_mode == GL_QUADS) {
+         param[3] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
+         param[2] = &ctx->TessCtrlProgram.patch_default_inner_level[1];
+         param[1] = &zero;
+         param[0] = &zero;
+      } else if (key->tes_primitive_mode == GL_TRIANGLES) {
+         param[4] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
+         for (int i = 0; i < 4; i++)
+            param[i] = &zero;
+      }
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS) && tcs)
+      brw_dump_ir("tessellation control", shader_prog, tcs, NULL);
+
+   int st_index = -1;
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS);
+
+   if (unlikely(brw->perf_debug)) {
+      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
+      start_time = get_time();
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+   unsigned program_size;
+   char *error_str;
+   const unsigned *program =
+      brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index,
+                      &program_size, &error_str);
+   if (program == NULL) {
+      if (shader_prog) {
+         shader_prog->LinkStatus = false;
+         ralloc_strcat(&shader_prog->InfoLog, error_str);
+      } else {
+         ralloc_free(nir);
+      }
+
+      _mesa_problem(NULL, "Failed to compile tessellation control shader: "
+                    "%s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   }
+
+   if (unlikely(brw->perf_debug)) {
+      struct brw_shader *btcs = (struct brw_shader *) tcs;
+      if (btcs->compiled_once) {
+         brw_tcs_debug_recompile(brw, shader_prog, key);
+      }
+      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+         perf_debug("TCS compile took %.03f ms and stalled the GPU\n",
+                    (get_time() - start_time) * 1000);
+      }
+      btcs->compiled_once = true;
+   }
+
+   /* Scratch space is used for register spilling */
+   if (prog_data.base.base.total_scratch) {
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+                        prog_data.base.base.total_scratch *
+                         brw->max_hs_threads);
+   }
+
+   brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                    key, sizeof(*key),
+                    program, program_size,
+                    &prog_data, sizeof(prog_data),
+                    &stage_state->prog_offset, &brw->tcs.prog_data);
+   ralloc_free(mem_ctx);
+   if (!tcs)
+      ralloc_free(nir);
+
+   return true;
+}
+
+
+void
+brw_upload_tcs_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   struct brw_tcs_prog_key key;
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct brw_tess_ctrl_program *tcp =
+      (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
+   struct brw_tess_eval_program *tep =
+      (struct brw_tess_eval_program *) brw->tess_eval_program;
+   assert(tep);
+
+   if (!brw_state_dirty(brw,
+                        _NEW_TEXTURE,
+                        BRW_NEW_PATCH_PRIMITIVE |
+                        BRW_NEW_TESS_PROGRAMS))
+      return;
+
+   struct gl_program *prog = &tcp->program.Base;
+
+   memset(&key, 0, sizeof(key));
+
+   key.input_vertices = ctx->TessCtrlProgram.patch_vertices;
+   key.outputs_written = per_vertex_slots;
+   key.patch_outputs_written = per_patch_slots;
+
+   /* We need to specialize our code generation for tessellation levels
+    * based on the domain the DS is expecting to tessellate.
+    */
+   key.tes_primitive_mode = tep->program.PrimitiveMode;
+
+   if (tcp) {
+      key.program_string_id = tcp->id;
+
+      /* _NEW_TEXTURE */
+      brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
+                                         &key.tex);
+   } else {
+      key.outputs_written = tep->program.Base.InputsRead;
+   }
+
+
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                         &key, sizeof(key),
+                         &stage_state->prog_offset, &brw->tcs.prog_data)) {
+      bool success = brw_codegen_tcs_prog(brw, current[MESA_SHADER_TESS_CTRL],
+                                          tcp, &key);
+      assert(success);
+      (void)success;
+   }
+   brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base;
+}
+
+
+bool
+brw_tcs_precompile(struct gl_context *ctx,
+                   struct gl_shader_program *shader_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_tcs_prog_key key;
+   uint32_t old_prog_offset = brw->tcs.base.prog_offset;
+   struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data;
+   bool success;
+
+   struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
+   struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = btcp->id;
+   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+
+   /* Guess that the input and output patches have the same dimensionality. */
+   key.input_vertices = shader_prog->TessCtrl.VerticesOut;
+
+   key.tes_primitive_mode =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] ?
+      shader_prog->TessEval.PrimitiveMode : GL_TRIANGLES;
+
+   key.outputs_written = prog->OutputsWritten;
+   key.patch_outputs_written = prog->PatchOutputsWritten;
+
+   success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key);
+
+   brw->tcs.base.prog_offset = old_prog_offset;
+   brw->tcs.prog_data = old_prog_data;
+
+   return success;
+}
index 115c5ab..28cef3c 100644 (file)
@@ -39,7 +39,7 @@ brw_upload_tcs_pull_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tcs.base;
 
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct brw_tess_ctrl_program *tcp =
       (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
 
@@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tcs_pull_constants = {
       .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TCS_PROG_DATA |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_pull_constants,
 };
@@ -122,7 +122,7 @@ static void
 brw_upload_tcs_image_surfaces(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
 
@@ -138,7 +138,7 @@ const struct brw_tracked_state brw_tcs_image_surfaces = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TCS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_image_surfaces,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
new file mode 100644 (file)
index 0000000..27dc7e5
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_tes.c
+ *
+ * Tessellation evaluation shader state upload code.
+ */
+
+#include "brw_context.h"
+#include "brw_nir.h"
+#include "brw_program.h"
+#include "brw_shader.h"
+#include "brw_state.h"
+#include "program/prog_parameter.h"
+
+static void
+brw_tes_debug_recompile(struct brw_context *brw,
+                       struct gl_shader_program *shader_prog,
+                       const struct brw_tes_prog_key *key)
+{
+   struct brw_cache_item *c = NULL;
+   const struct brw_tes_prog_key *old_key = NULL;
+   bool found = false;
+
+   perf_debug("Recompiling tessellation evaluation shader for program %d\n",
+              shader_prog->Name);
+
+   for (unsigned int i = 0; i < brw->cache.size; i++) {
+      for (c = brw->cache.items[i]; c; c = c->next) {
+         if (c->cache_id == BRW_CACHE_TES_PROG) {
+            old_key = c->key;
+
+            if (old_key->program_string_id == key->program_string_id)
+               break;
+         }
+      }
+      if (c)
+         break;
+   }
+
+   if (!c) {
+      perf_debug("  Didn't find previous compile in the shader cache for "
+                 "debug\n");
+      return;
+   }
+
+   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+   found |= key_debug(brw, "inputs read", old_key->inputs_read,
+                      key->inputs_read);
+   found |= key_debug(brw, "patch inputs read", old_key->patch_inputs_read,
+                      key->patch_inputs_read);
+
+   if (!found) {
+      perf_debug("  Something else\n");
+   }
+}
+
+static bool
+brw_codegen_tes_prog(struct brw_context *brw,
+                     struct gl_shader_program *shader_prog,
+                     struct brw_tess_eval_program *tep,
+                     struct brw_tes_prog_key *key)
+{
+   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
+   struct brw_stage_state *stage_state = &brw->tes.base;
+   nir_shader *nir = tep->program.Base.nir;
+   struct brw_tes_prog_data prog_data;
+   bool start_busy = false;
+   double start_time = 0;
+
+   memset(&prog_data, 0, sizeof(prog_data));
+
+   brw_assign_common_binding_table_offsets(MESA_SHADER_TESS_EVAL, devinfo,
+                                           shader_prog, &tep->program.Base,
+                                           &prog_data.base.base, 0);
+
+   switch (tep->program.Spacing) {
+   case GL_EQUAL:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_INTEGER;
+      break;
+   case GL_FRACTIONAL_ODD:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_ODD_FRACTIONAL;
+      break;
+   case GL_FRACTIONAL_EVEN:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_EVEN_FRACTIONAL;
+      break;
+   default:
+      unreachable("invalid domain shader spacing");
+   }
+
+   switch (tep->program.PrimitiveMode) {
+   case GL_QUADS:
+      prog_data.domain = BRW_TESS_DOMAIN_QUAD;
+      break;
+   case GL_TRIANGLES:
+      prog_data.domain = BRW_TESS_DOMAIN_TRI;
+      break;
+   case GL_ISOLINES:
+      prog_data.domain = BRW_TESS_DOMAIN_ISOLINE;
+      break;
+   default:
+      unreachable("invalid domain shader primitive mode");
+   }
+
+   if (tep->program.PointMode) {
+      prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else if (tep->program.PrimitiveMode == GL_ISOLINES) {
+      prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* Hardware winding order is backwards from OpenGL */
+      switch (tep->program.VertexOrder) {
+      case GL_CCW:
+         prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW;
+         break;
+      case GL_CW:
+         prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
+         break;
+      default:
+         unreachable("invalid domain shader vertex order");
+      }
+   }
+
+   /* Allocate the references to the uniforms that will end up in the
+    * prog_data associated with the compiled program, and which will be freed
+    * by the state cache.
+    *
+    * Note: param_count needs to be num_uniform_components * 4, since we add
+    * padding around uniform values below vec4 size, so the worst case is that
+    * every uniform is a float which gets padded to the size of a vec4.
+    */
+   struct gl_shader *tes = shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+   int param_count = nir->num_uniforms;
+   if (!compiler->scalar_stage[MESA_SHADER_TESS_EVAL])
+      param_count *= 4;
+
+   prog_data.base.base.param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.pull_param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, tes->NumImages);
+   prog_data.base.base.nr_params = param_count;
+   prog_data.base.base.nr_image_params = tes->NumImages;
+
+   brw_nir_setup_glsl_uniforms(nir, shader_prog, &tep->program.Base,
+                               &prog_data.base.base,
+                               compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TES))
+      brw_dump_ir("tessellation evaluation", shader_prog, tes, NULL);
+
+   int st_index = -1;
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TES);
+
+   if (unlikely(brw->perf_debug)) {
+      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
+      start_time = get_time();
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+   unsigned program_size;
+   char *error_str;
+   const unsigned *program =
+      brw_compile_tes(compiler, brw, mem_ctx, key, &prog_data, nir,
+                      shader_prog, st_index, &program_size, &error_str);
+   if (program == NULL) {
+      if (shader_prog) {
+         shader_prog->LinkStatus = false;
+         ralloc_strcat(&shader_prog->InfoLog, error_str);
+      }
+
+      _mesa_problem(NULL, "Failed to compile tessellation evaluation shader: "
+                    "%s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   }
+
+   if (unlikely(brw->perf_debug)) {
+      struct brw_shader *btes = (struct brw_shader *) tes;
+      if (btes->compiled_once) {
+         brw_tes_debug_recompile(brw, shader_prog, key);
+      }
+      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+         perf_debug("TES compile took %.03f ms and stalled the GPU\n",
+                    (get_time() - start_time) * 1000);
+      }
+      btes->compiled_once = true;
+   }
+
+   /* Scratch space is used for register spilling */
+   if (prog_data.base.base.total_scratch) {
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+                        prog_data.base.base.total_scratch *
+                         brw->max_ds_threads);
+   }
+
+   brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG,
+                    key, sizeof(*key),
+                    program, program_size,
+                    &prog_data, sizeof(prog_data),
+                    &stage_state->prog_offset, &brw->tes.prog_data);
+   ralloc_free(mem_ctx);
+
+   return true;
+}
+
+
+void
+brw_upload_tes_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
+   struct brw_stage_state *stage_state = &brw->tes.base;
+   struct brw_tes_prog_key key;
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct brw_tess_eval_program *tep =
+      (struct brw_tess_eval_program *) brw->tess_eval_program;
+
+   if (!brw_state_dirty(brw,
+                        _NEW_TEXTURE,
+                        BRW_NEW_TESS_PROGRAMS))
+      return;
+
+   struct gl_program *prog = &tep->program.Base;
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = tep->id;
+
+   /* Ignore gl_TessLevelInner/Outer - we treat them as system values,
+    * not inputs, and they're always present in the URB entry regardless
+    * of whether or not we read them.
+    */
+   key.inputs_read = per_vertex_slots &
+      ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+   key.patch_inputs_read = per_patch_slots;
+
+   /* _NEW_TEXTURE */
+   brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
+                                      &key.tex);
+
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG,
+                         &key, sizeof(key),
+                         &stage_state->prog_offset, &brw->tes.prog_data)) {
+      bool success = brw_codegen_tes_prog(brw, current[MESA_SHADER_TESS_EVAL],
+                                          tep, &key);
+      assert(success);
+      (void)success;
+   }
+   brw->tes.base.prog_data = &brw->tes.prog_data->base.base;
+}
+
+
+bool
+brw_tes_precompile(struct gl_context *ctx,
+                   struct gl_shader_program *shader_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_tes_prog_key key;
+   uint32_t old_prog_offset = brw->tes.base.prog_offset;
+   struct brw_tes_prog_data *old_prog_data = brw->tes.prog_data;
+   bool success;
+
+   struct gl_tess_eval_program *tep = (struct gl_tess_eval_program *)prog;
+   struct brw_tess_eval_program *btep = brw_tess_eval_program(tep);
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = btep->id;
+   key.inputs_read = prog->InputsRead;
+   key.patch_inputs_read = prog->PatchInputsRead;
+
+   if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
+      struct gl_program *tcp =
+         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
+      key.inputs_read |= tcp->OutputsWritten;
+      key.patch_inputs_read |= tcp->PatchOutputsWritten;
+   }
+
+   /* Ignore gl_TessLevelInner/Outer - they're system values. */
+   key.inputs_read &= ~(VARYING_BIT_TESS_LEVEL_INNER |
+                        VARYING_BIT_TESS_LEVEL_OUTER);
+
+   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+
+   success = brw_codegen_tes_prog(brw, shader_prog, btep, &key);
+
+   brw->tes.base.prog_offset = old_prog_offset;
+   brw->tes.prog_data = old_prog_data;
+
+   return success;
+}
index 142bd5a..eff1740 100644 (file)
@@ -39,7 +39,7 @@ brw_upload_tes_pull_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tes.base;
 
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct brw_tess_eval_program *dp =
       (struct brw_tess_eval_program *) brw->tess_eval_program;
 
@@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tes_pull_constants = {
       .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TES_PROG_DATA |
-             BRW_NEW_TESS_EVAL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tes_pull_constants,
 };
@@ -122,7 +122,7 @@ static void
 brw_upload_tes_image_surfaces(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
 
@@ -137,7 +137,7 @@ const struct brw_tracked_state brw_tes_image_surfaces = {
    .dirty = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_IMAGE_UNITS |
-             BRW_NEW_TESS_EVAL_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_TES_PROG_DATA,
    },
    .emit = brw_upload_tes_image_surfaces,
index bf7f9c6..934b6b8 100644 (file)
@@ -98,3 +98,31 @@ GLuint brw_translate_blend_factor( GLenum factor )
       unreachable("not reached");
    }
 }
+
+static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
+   [GL_POINTS] =_3DPRIM_POINTLIST,
+   [GL_LINES] = _3DPRIM_LINELIST,
+   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
+   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
+   [GL_TRIANGLES] = _3DPRIM_TRILIST,
+   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+   [GL_QUADS] = _3DPRIM_QUADLIST,
+   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+   [GL_POLYGON] = _3DPRIM_POLYGON,
+   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};
+
+uint32_t
+get_hw_prim_for_gl_prim(int mode)
+{
+   if (mode >= BRW_PRIM_OFFSET)
+      return mode - BRW_PRIM_OFFSET;
+   else {
+      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
+      return prim_to_hw_prim[mode];
+   }
+}
index 1304e23..358a710 100644 (file)
@@ -155,6 +155,10 @@ vec4_instruction::is_send_from_grf()
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case VEC4_OPCODE_URB_READ:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
+   case SHADER_OPCODE_BARRIER:
       return true;
    default:
       return false;
@@ -184,7 +188,10 @@ bool
 vec4_instruction::has_source_and_destination_hazard() const
 {
    switch (opcode) {
-   /* Most opcodes in the vec4 world use MRFs. */
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+      return true;
    default:
       return false;
    }
@@ -204,6 +211,7 @@ vec4_instruction::regs_read(unsigned arg) const
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case TCS_OPCODE_URB_WRITE:
       return arg == 0 ? mlen : 1;
 
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
@@ -268,6 +276,7 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
    case SHADER_OPCODE_POW:
       return 2;
    case VS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_THREAD_END:
       return 1;
    case VS_OPCODE_PULL_CONSTANT_LOAD:
       return 2;
@@ -281,6 +290,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
       return 0;
    case GS_OPCODE_FF_SYNC:
       return 1;
+   case TCS_OPCODE_URB_WRITE:
+      return 0;
    case SHADER_OPCODE_SHADER_TIME_ADD:
       return 0;
    case SHADER_OPCODE_TEX:
@@ -1549,7 +1560,7 @@ int
 vec4_vs_visitor::setup_attributes(int payload_reg)
 {
    int nr_attributes;
-   int attribute_map[VERT_ATTRIB_MAX + 1];
+   int attribute_map[VERT_ATTRIB_MAX + 2];
    memset(attribute_map, 0, sizeof(attribute_map));
 
    nr_attributes = 0;
@@ -1560,12 +1571,19 @@ vec4_vs_visitor::setup_attributes(int payload_reg)
       }
    }
 
+   if (vs_prog_data->uses_drawid) {
+      attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
+      nr_attributes++;
+   }
+
    /* VertexID is stored by the VF as the last vertex element, but we
     * don't represent it with a flag in inputs_read, so we call it
     * VERT_ATTRIB_MAX.
     */
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
+       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
+      nr_attributes++;
    }
 
    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
@@ -1758,9 +1776,22 @@ vec4_visitor::convert_to_hw_regs()
          case ATTR:
             unreachable("not reached");
          }
+
          src = reg;
       }
 
+      if (inst->is_3src()) {
+         /* 3-src instructions with scalar sources support arbitrary subnr,
+          * but don't actually use swizzles.  Convert swizzle into subnr.
+          */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
+            }
+         }
+      }
+
       dst_reg &dst = inst->dst;
       struct brw_reg reg;
 
@@ -1963,11 +1994,18 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
     * incoming vertex attribute.  So, add an extra slot.
     */
    if (shader->info.system_values_read &
-       (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+       (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+        BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+        BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
       nr_attributes++;
    }
 
+   /* gl_DrawID has its very own vec4 */
+   if (shader->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
+      nr_attributes++;
+   }
+
    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
     * vec4 mode, the hardware appears to wedge unless we read something.
@@ -2006,7 +2044,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
 
       fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                      &prog_data->base.base, v.promoted_constants,
-                     v.runtime_check_aads_emit, "VS");
+                     v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
       if (INTEL_DEBUG & DEBUG_VS) {
          const char *debug_name =
             ralloc_asprintf(mem_ctx, "%s vertex shader %s",
index 0dc04ea..83d9eda 100644 (file)
@@ -258,6 +258,7 @@ public:
                      src_reg offset_value,
                      src_reg mcs,
                      bool is_cube_array,
+                     uint32_t surface, src_reg surface_reg,
                      uint32_t sampler, src_reg sampler_reg);
 
    src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
@@ -310,6 +311,8 @@ public:
 
    bool is_high_sampler(src_reg sampler);
 
+   bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
+
    virtual void emit_nir_code();
    virtual void nir_setup_uniforms();
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
@@ -337,6 +340,7 @@ public:
                        unsigned num_components = 4);
    src_reg get_nir_src(nir_src src,
                        unsigned num_components = 4);
+   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
 
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type) = 0;
index 85cbf24..0c1f0c3 100644 (file)
@@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst)
    case VEC4_OPCODE_UNPACK_UNIFORM:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
       return true;
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
index 2d0722a..166bc17 100644 (file)
@@ -45,6 +45,11 @@ can_do_writemask(const struct brw_device_info *devinfo,
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+   case VEC4_OPCODE_URB_READ:
       return false;
    default:
       /* The MATH instruction on Gen6 only executes in align1 mode, which does
index 71a7f63..237534d 100644 (file)
@@ -20,7 +20,6 @@
  * IN THE SOFTWARE.
  */
 
-#include "glsl/glsl_parser_extras.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
@@ -110,6 +109,7 @@ generate_tex(struct brw_codegen *p,
              vec4_instruction *inst,
              struct brw_reg dst,
              struct brw_reg src,
+             struct brw_reg surface_index,
              struct brw_reg sampler_index)
 {
    const struct brw_device_info *devinfo = p->devinfo;
@@ -265,14 +265,16 @@ generate_tex(struct brw_codegen *p,
          ? prog_data->base.binding_table.gather_texture_start
          : prog_data->base.binding_table.texture_start;
 
-   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
       uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  dst,
                  inst->base_mrf,
                  src,
-                 sampler + base_binding_table_index,
+                 surface + base_binding_table_index,
                  sampler % 16,
                  msg_type,
                  1, /* response length */
@@ -286,14 +288,19 @@ generate_tex(struct brw_codegen *p,
       /* Non-constant sampler index. */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
-      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+         brw_OR(p, addr, addr, surface_reg);
+      }
       if (base_binding_table_index)
          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
@@ -714,6 +721,338 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
 }
 
 static void
+generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
+
+   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
+    *
+    * Since we operate in SIMD4x2 mode, we need run half as many threads
+    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
+    * shift right by one less to accomplish the multiplication by two.
+    */
+   dst = retype(dst, BRW_REGISTER_TYPE_UD);
+   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
+   const int shift = ivb ? 16 : 17;
+
+   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
+   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
+           brw_imm_ud(shift - 1));
+   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_urb_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg urb_header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, urb_header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              inst->mlen /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+   if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
+      brw_inst_set_eot(devinfo, send, 1);
+   } else {
+      brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+      brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+   }
+
+   /* what happens to swizzles? */
+}
+
+
+static void
+generate_tcs_input_urb_offsets(struct brw_codegen *p,
+                               struct brw_reg dst,
+                               struct brw_reg vertex,
+                               struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation.
+    * Inputs are a vertex index, and a byte offset from the beginning of
+    * the vertex. */
+
+   /* If `vertex` is not an immediate, we clobber a0.0 */
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
+
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* m0.5 bits 8-15 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* m0.0-0.1: URB handles */
+   if (vertex.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t vertex_index = vertex.ud;
+      struct brw_reg index_reg = brw_vec1_grf(
+            1 + (vertex_index >> 3), vertex_index & 7);
+
+      brw_MOV(p, vec2(get_element_ud(dst, 0)),
+              retype(index_reg, BRW_REGISTER_TYPE_UD));
+   } else {
+      /* Use indirect addressing.  ICP Handles are DWords (single channels
+       * of a register) and start at g1.0.
+       *
+       * In order to start our region at g1.0, we add 8 to the vertex index,
+       * effectively skipping over the 8 channels in g0.0.  This gives us a
+       * DWord offset to the ICP Handle.
+       *
+       * Indirect addressing works in terms of bytes, so we then multiply
+       * the DWord offset by 4 (by shifting left by 2).
+       */
+      struct brw_reg addr = brw_address_reg(0);
+
+      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
+
+      /* top half: m0.1 = g[1.0 + vertex.4]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
+   }
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+
+static void
+generate_tcs_output_urb_offsets(struct brw_codegen *p,
+                                struct brw_reg dst,
+                                struct brw_reg write_mask,
+                                struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
+
+   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
+   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   unsigned mask = write_mask.ud;
+
+   /* m0.5 bits 15:12 and 11:8 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
+
+   /* HS patch URB handle is delivered in r0.0 */
+   struct brw_reg urb_handle = brw_vec1_grf(0, 0);
+
+   /* m0.0-0.1: URB handles */
+   brw_MOV(p, vec2(get_element_ud(dst, 0)),
+           retype(urb_handle, BRW_REGISTER_TYPE_UD));
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_create_input_read_header(struct brw_codegen *p,
+                                      struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* Initialize the register to 0 */
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* Enable all the channels in m0.5 bits 15:8 */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
+    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
+    */
+   brw_AND(p, vec2(get_element_ud(dst, 0)),
+           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(0x1fff));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
+                                     struct brw_reg dst,
+                                     struct brw_reg header,
+                                     struct brw_reg offset)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   brw_MOV(p, dst, header);
+   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
+   brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_vec4_urb_read(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 1 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+}
+
+static void
+generate_tcs_release_input(struct brw_codegen *p,
+                           struct brw_reg header,
+                           struct brw_reg vertex,
+                           struct brw_reg is_unpaired)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD);
+
+   /* m0.0-0.1: URB handles */
+   struct brw_reg urb_handles =
+      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
+             BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
+   brw_pop_insn_state(p);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, header);
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_complete(devinfo, send, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
+                                    BRW_URB_SWIZZLE_NONE :
+                                    BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
+{
+   struct brw_reg header = brw_message_reg(inst->base_mrf);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, get_element_ud(header, 0),
+           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p);
+
+   brw_urb_WRITE(p,
+                 brw_null_reg(), /* dest */
+                 inst->base_mrf, /* starting mrf reg nr */
+                 header,
+                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
+                 inst->mlen,
+                 0,              /* response len */
+                 0,              /* urb destination offset */
+                 0);
+}
+
+static void
+generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_create_barrier_header(struct brw_codegen *p,
+                                   struct brw_vue_prog_data *prog_data,
+                                   struct brw_reg dst)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
+   struct brw_reg m0_2 = get_element_ud(dst, 2);
+   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* Zero the message header */
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+
+   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
+   brw_AND(p, m0_2,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));
+
+   /* Shift it up to bits 27:24. */
+   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));
+
+   /* Set the Barrier Count and the enable bit */
+   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
+
+   brw_pop_insn_state(p);
+}
+
+static void
 generate_oword_dual_block_offsets(struct brw_codegen *p,
                                   struct brw_reg m1,
                                   struct brw_reg index)
@@ -1375,7 +1714,7 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-         generate_tex(p, prog_data, inst, dst, src[0], src[1]);
+         generate_tex(p, prog_data, inst, dst, src[0], src[1], src[2]);
          break;
 
       case VS_OPCODE_URB_WRITE:
@@ -1580,6 +1919,65 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case TCS_OPCODE_URB_WRITE:
+         generate_tcs_urb_write(p, inst, src[0]);
+         break;
+
+      case VEC4_OPCODE_URB_READ:
+         generate_vec4_urb_read(p, inst, dst, src[0]);
+         break;
+
+      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_GET_INSTANCE_ID:
+         generate_tcs_get_instance_id(p, dst);
+         break;
+
+      case TCS_OPCODE_GET_PRIMITIVE_ID:
+         generate_tcs_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_CREATE_BARRIER_HEADER:
+         generate_tcs_create_barrier_header(p, prog_data, dst);
+         break;
+
+      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+         generate_tes_create_input_read_header(p, dst);
+         break;
+
+      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
+         break;
+
+      case TES_OPCODE_GET_PRIMITIVE_ID:
+         generate_tes_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_SRC0_010_IS_ZERO:
+         /* If src_reg had stride like fs_reg, we wouldn't need this. */
+         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
+         brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+         break;
+
+      case TCS_OPCODE_RELEASE_INPUT:
+         generate_tcs_release_input(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_THREAD_END:
+         generate_tcs_thread_end(p, inst);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+         brw_barrier(p, src[0]);
+         brw_WAIT(p);
+         break;
+
       case SHADER_OPCODE_MOV_INDIRECT:
          generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
 
@@ -1638,7 +2036,7 @@ generate_code(struct brw_codegen *p,
 
    compiler->shader_debug_log(log_data,
                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "compacted %d to %d bytes.\n",
+                              "compacted %d to %d bytes.",
                               stage_abbrev, before_size / 16,
                               loop_count, cfg->cycle_count,
                               before_size, after_size);
index b13d36e..b2a971a 100644 (file)
@@ -839,7 +839,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
 
          fs_generator g(compiler, log_data, mem_ctx, &c.key,
                         &prog_data->base.base, v.promoted_constants,
-                        false, "GS");
+                        false, MESA_SHADER_GEOMETRY);
          if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
             const char *label =
                shader->info.label ? shader->info.label : "unnamed";
index bafc9a5..46cbbfa 100644 (file)
@@ -41,10 +41,10 @@ vec4_visitor::emit_nir_code()
    nir_setup_system_values();
 
    /* get the main function and emit it */
-   nir_foreach_overload(nir, overload) {
-      assert(strcmp(overload->function->name, "main") == 0);
-      assert(overload->impl);
-      nir_emit_impl(overload->impl);
+   nir_foreach_function(nir, function) {
+      assert(strcmp(function->name, "main") == 0);
+      assert(function->impl);
+      nir_emit_impl(function->impl);
    }
 }
 
@@ -78,6 +78,20 @@ vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
                                            glsl_type::int_type);
       break;
 
+   case nir_intrinsic_load_base_instance:
+      reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
+      if (reg->file == BAD_FILE)
+         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE,
+                                           glsl_type::int_type);
+      break;
+
+   case nir_intrinsic_load_draw_id:
+      reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID];
+      if (reg->file == BAD_FILE)
+         *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID,
+                                           glsl_type::int_type);
+      break;
+
    default:
       break;
    }
@@ -107,10 +121,10 @@ vec4_visitor::nir_setup_system_values()
       nir_system_values[i] = dst_reg();
    }
 
-   nir_foreach_overload(nir, overload) {
-      assert(strcmp(overload->function->name, "main") == 0);
-      assert(overload->impl);
-      nir_foreach_block(overload->impl, setup_system_values_block, this);
+   nir_foreach_function(nir, function) {
+      assert(strcmp(function->name, "main") == 0);
+      assert(function->impl);
+      nir_foreach_block(function->impl, setup_system_values_block, this);
    }
 }
 
@@ -318,6 +332,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
    return get_nir_src(src, nir_type_int, num_components);
 }
 
+src_reg
+vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (const_value) {
+      /* The only constant offset we should find is 0.  brw_nir.c's
+       * add_const_offset_to_base() will fold other constant offsets
+       * into instr->const_index[0].
+       */
+      assert(const_value->u[0] == 0);
+      return src_reg();
+   }
+
+   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
+}
+
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
@@ -641,7 +673,12 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id: {
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id:
+   case nir_intrinsic_load_invocation_id:
+   case nir_intrinsic_load_tess_level_inner:
+   case nir_intrinsic_load_tess_level_outer: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
       src_reg val = src_reg(nir_system_values[sv]);
       assert(val.file != BAD_FILE);
@@ -881,6 +918,59 @@ brw_conditional_for_nir_comparison(nir_op op)
    }
 }
 
+bool
+vec4_visitor::optimize_predicate(nir_alu_instr *instr,
+                                 enum brw_predicate *predicate)
+{
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *cmp_instr =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+   switch (cmp_instr->op) {
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   default:
+      return false;
+   }
+
+   unsigned size_swizzle =
+      brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
+
+   src_reg op[2];
+   assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
+   for (unsigned i = 0; i < 2; i++) {
+      op[i] = get_nir_src(cmp_instr->src[i].src,
+                          nir_op_infos[cmp_instr->op].input_types[i], 4);
+      unsigned base_swizzle =
+         brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
+      op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle);
+      op[i].abs = cmp_instr->src[i].abs;
+      op[i].negate = cmp_instr->src[i].negate;
+   }
+
+   emit(CMP(dst_null_d(), op[0], op[1],
+            brw_conditional_for_nir_comparison(cmp_instr->op)));
+
+   return true;
+}
+
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
@@ -972,7 +1062,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_umul_high: {
       struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
 
-      emit(MUL(acc, op[0], op[1]));
+      if (devinfo->gen >=8)
+         emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
+      else
+         emit(MUL(acc, op[0], op[1]));
+
       emit(MACH(dst, op[0], op[1]));
       break;
    }
@@ -1008,9 +1102,41 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
       emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
       break;
 
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
+      inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = emit(MOV(dst_null_d(), src_reg(dst)));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       */
+      src_reg tmp = src_reg(this, glsl_type::ivec4_type);
+      inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = emit(ADD(dst, src_reg(dst), op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
    case nir_op_ldexp:
       unreachable("not reached: should be handled by ldexp_to_arith()");
 
@@ -1080,6 +1206,16 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      /* See also vec4_visitor::emit_pack_half_2x16() */
+      src_reg tmp = src_reg(this, glsl_type::uvec4_type);
+
+      emit(F32TO16(dst_reg(tmp), op[0]));
+      inst = emit(F16TO32(dst, tmp));
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin:
@@ -1288,6 +1424,9 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_ubitfield_extract:
    case nir_op_ibitfield_extract:
+      unreachable("should have been lowered");
+   case nir_op_ubfe:
+   case nir_op_ibfe:
       op[0] = fix_3src_operand(op[0]);
       op[1] = fix_3src_operand(op[1]);
       op[2] = fix_3src_operand(op[2]);
@@ -1308,8 +1447,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_bitfield_insert:
-      unreachable("not reached: should be handled by "
-                  "lower_instructions::bitfield_insert_to_bfm_bfi");
+      unreachable("not reached: should have been lowered");
 
    case nir_op_fsign:
       /* AND(val, 0x80000000) gives the sign bit.
@@ -1371,25 +1509,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_bcsel:
-      emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
-      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
-      switch (dst.writemask) {
-      case WRITEMASK_X:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
-         break;
-      case WRITEMASK_Y:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
-         break;
-      case WRITEMASK_Z:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
-         break;
-      case WRITEMASK_W:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
-         break;
-      default:
-         inst->predicate = BRW_PREDICATE_NORMAL;
-         break;
+      enum brw_predicate predicate;
+      if (!optimize_predicate(instr, &predicate)) {
+         emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+         switch (dst.writemask) {
+         case WRITEMASK_X:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+            break;
+         case WRITEMASK_Y:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+            break;
+         case WRITEMASK_Z:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+            break;
+         case WRITEMASK_W:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+            break;
+         default:
+            predicate = BRW_PREDICATE_NORMAL;
+            break;
+         }
       }
+      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+      inst->predicate = predicate;
       break;
 
    case nir_op_fdot_replicated2:
@@ -1412,20 +1554,6 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_bany2:
-   case nir_op_bany3:
-   case nir_op_bany4: {
-      unsigned swiz =
-         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
-
-      emit(CMP(dst_null_d(), swizzle(op[0], swiz), brw_imm_d(0),
-               BRW_CONDITIONAL_NZ));
-      emit(MOV(dst, brw_imm_d(0)));
-      inst = emit(MOV(dst, brw_imm_d(~0)));
-      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
-      break;
-   }
-
    case nir_op_fabs:
    case nir_op_iabs:
    case nir_op_fneg:
@@ -1475,7 +1603,6 @@ vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
       break;
 
    case nir_jump_return:
-      /* fall through */
    default:
       unreachable("unknown jump");
    }
@@ -1528,7 +1655,9 @@ glsl_type_for_nir_alu_type(nir_alu_type alu_type,
 void
 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
 {
+   unsigned texture = instr->texture_index;
    unsigned sampler = instr->sampler_index;
+   src_reg texture_reg = brw_imm_ud(texture);
    src_reg sampler_reg = brw_imm_ud(sampler);
    src_reg coordinate;
    const glsl_type *coord_type = NULL;
@@ -1543,6 +1672,10 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
                                  nir_tex_instr_dest_size(instr));
    dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
 
+   /* Our hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    /* Load the texture operation sources */
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       switch (instr->src[i].src_type) {
@@ -1604,13 +1737,12 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
          break;
 
-      case nir_tex_src_sampler_offset: {
-         /* The highest sampler which may be used by this operation is
+      case nir_tex_src_texture_offset: {
+         /* The highest texture which may be used by this operation is
           * the last element of the array. Mark it here, because the generator
           * doesn't have enough information to determine the bound.
           */
-         uint32_t array_size = instr->sampler_array_size;
-         uint32_t max_used = sampler + array_size - 1;
+         uint32_t max_used = texture + instr->texture_array_size - 1;
          if (instr->op == nir_texop_tg4) {
             max_used += prog_data->base.binding_table.gather_texture_start;
          } else {
@@ -1622,6 +1754,15 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          /* Emit code to evaluate the actual indexing expression */
          src_reg src = get_nir_src(instr->src[i].src, 1);
          src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
+         texture_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
          emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
          sampler_reg = emit_uniformize(temp);
          break;
@@ -1660,7 +1801,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
    /* Stuff the channel select bits in the top of the texture offset */
    if (instr->op == nir_texop_tg4) {
       if (instr->component == 1 &&
-          (key_tex->gather_channel_quirk_mask & (1 << sampler))) {
+          (key_tex->gather_channel_quirk_mask & (1 << texture))) {
          /* gather4 sampler is broken for green channel on RG32F --
           * we must ask for blue instead.
           */
@@ -1681,7 +1822,8 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
                 shadow_comparitor,
                 lod, lod2, sample_index,
                 constant_offset, offset_value,
-                mcs, is_cube_array, sampler, sampler_reg);
+                mcs, is_cube_array,
+                texture, texture_reg, sampler, sampler_reg);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
new file mode 100644 (file)
index 0000000..3d83152
--- /dev/null
@@ -0,0 +1,587 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.cpp
+ *
+ * Tessellaton control shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+
+namespace brw {
+
+vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
+                                   void *log_data,
+                                   const struct brw_tcs_prog_key *key,
+                                   struct brw_tcs_prog_data *prog_data,
+                                   const nir_shader *nir,
+                                   void *mem_ctx,
+                                   int shader_time_index,
+                                   const struct brw_vue_map *input_vue_map)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  nir, mem_ctx, false, shader_time_index),
+     input_vue_map(input_vue_map), key(key)
+{
+}
+
+
+void
+vec4_tcs_visitor::emit_nir_code()
+{
+   if (key->program_string_id != 0) {
+      /* We have a real application-supplied TCS, emit real code. */
+      vec4_visitor::emit_nir_code();
+   } else {
+      /* There is no TCS; automatically generate a passthrough shader
+       * that writes the API-specified default tessellation levels and
+       * copies VS outputs to TES inputs.
+       */
+      uniforms = 2;
+
+      uint64_t varyings = key->outputs_written;
+
+      src_reg vertex_offset(this, glsl_type::uint_type);
+      emit(MUL(dst_reg(vertex_offset), invocation_id,
+               brw_imm_ud(prog_data->vue_map.num_per_vertex_slots)));
+
+      while (varyings != 0) {
+         const int varying = ffsll(varyings) - 1;
+
+         unsigned in_offset = input_vue_map->varying_to_slot[varying];
+         unsigned out_offset = prog_data->vue_map.varying_to_slot[varying];
+         assert(out_offset >= 2);
+
+         dst_reg val(this, glsl_type::vec4_type);
+         emit_input_urb_read(val, invocation_id, in_offset, src_reg());
+         emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset,
+                        vertex_offset);
+
+         varyings &= ~BITFIELD64_BIT(varying);
+      }
+
+      /* Only write the tessellation factors from invocation 0.
+       * There's no point in making other threads do redundant work.
+       */
+      emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0),
+               BRW_CONDITIONAL_EQ));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 0, src_reg());
+      emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 1, src_reg());
+      emit(BRW_OPCODE_ENDIF);
+   }
+}
+
+void
+vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+}
+
+dst_reg *
+vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type)
+{
+   return NULL;
+}
+
+
+void
+vec4_tcs_visitor::setup_payload()
+{
+   int reg = 0;
+
+   /* The payload always contains important data in r0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg++;
+
+   /* r1.0 - r4.7 may contain the input control point URB handles,
+    * which we use to pull vertex data.
+    */
+   reg += 4;
+
+   /* Push constants may start at r5.0 */
+   reg = setup_uniforms(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_tcs_visitor::emit_prolog()
+{
+   invocation_id = src_reg(this, glsl_type::uint_type);
+   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
+
+   /* HS threads are dispatched with the dispatch mask set to 0xFF.
+    * If there are an odd number of output vertices, then the final
+    * HS instance dispatched will only have its bottom half doing real
+    * work, and so we need to disable the upper half:
+    */
+   if (nir->info.tcs.vertices_out % 2) {
+      emit(CMP(dst_null_d(), invocation_id,
+               brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L));
+
+      /* Matching ENDIF is in emit_thread_end() */
+      emit(IF(BRW_PREDICATE_NORMAL));
+   }
+}
+
+
+void
+vec4_tcs_visitor::emit_thread_end()
+{
+   vec4_instruction *inst;
+   current_annotation = "thread end";
+
+   if (nir->info.tcs.vertices_out % 2) {
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   if (devinfo->gen == 7) {
+      struct brw_tcs_prog_data *tcs_prog_data =
+         (struct brw_tcs_prog_data *) prog_data;
+
+      current_annotation = "release input vertices";
+
+      /* Synchronize all threads, so we know that no one is still
+       * using the input URB handles.
+       */
+      if (tcs_prog_data->instances > 1) {
+         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      }
+
+      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
+       * We want to compare the bottom half of invocation_id with 0, but
+       * use that truth value for the top half as well.  Unfortunately,
+       * we don't have stride in the vec4 world, nor UV immediates in
+       * align16, so we need an opcode to get invocation_id<0,4,0>.
+       */
+      emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
+      emit(IF(BRW_PREDICATE_NORMAL));
+      for (unsigned i = 0; i < key->input_vertices; i += 2) {
+         /* If we have an odd number of input vertices, the last will be
+          * unpaired.  We don't want to use an interleaved URB write in
+          * that case.
+          */
+         const bool is_unpaired = i == key->input_vertices - 1;
+
+         dst_reg header(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
+              brw_imm_ud(is_unpaired));
+      }
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   inst = emit(TCS_OPCODE_THREAD_END);
+   inst->base_mrf = 14;
+   inst->mlen = 1;
+}
+
+
+void
+vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
+                                      const src_reg &vertex_index,
+                                      unsigned base_offset,
+                                      const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+   dst_reg temp(this, glsl_type::ivec4_type);
+   temp.type = dst.type;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
+               indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+   inst->offset = base_offset;
+   inst->mlen = 1;
+   inst->base_mrf = -1;
+
+   /* Copy the temporary to the destination to deal with writemasking.
+    *
+    * Also attempt to deal with gl_PointSize being in the .w component.
+    */
+   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
+   } else {
+      emit(MOV(dst, src_reg(temp)));
+   }
+}
+
+void
+vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
+                                       unsigned base_offset,
+                                       const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
+               brw_imm_ud(dst.writemask), indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
+   read->offset = base_offset;
+   read->mlen = 1;
+   read->base_mrf = -1;
+}
+
+void
+vec4_tcs_visitor::emit_urb_write(const src_reg &value,
+                                 unsigned writemask,
+                                 unsigned base_offset,
+                                 const src_reg &indirect_offset)
+{
+   if (writemask == 0)
+      return;
+
+   src_reg message(this, glsl_type::uvec4_type, 2);
+   vec4_instruction *inst;
+
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
+               brw_imm_ud(writemask), indirect_offset);
+   inst->force_writemask_all = true;
+   inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value));
+   inst->force_writemask_all = true;
+
+   inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message);
+   inst->offset = base_offset;
+   inst->mlen = 2;
+   inst->base_mrf = -1;
+}
+
+/**
+ * Number of meaningful components in gl_TessLevelOuter[] for the domain.
+ */
+static unsigned
+tesslevel_outer_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 4;
+   case GL_TRIANGLES:
+      return 3;
+   case GL_ISOLINES:
+      return 2;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   /* Unreachable; present only to silence missing-return warnings. */
+   return 0;
+}
+
+/**
+ * Number of meaningful components in gl_TessLevelInner[] for the domain
+ * (isolines have no inner tessellation levels).
+ */
+static unsigned
+tesslevel_inner_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 2;
+   case GL_TRIANGLES:
+      return 1;
+   case GL_ISOLINES:
+      return 0;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   /* Unreachable; present only to silence missing-return warnings. */
+   return 0;
+}
+
+/**
+ * Given a normal .xyzw writemask, convert it to a writemask for a vector
+ * that's stored backwards, i.e. .wzyx.
+ */
+static unsigned
+writemask_for_backwards_vector(unsigned mask)
+{
+   unsigned new_mask = 0;
+
+   /* Bit i (channel i) moves to bit 3-i (channel 3-i). */
+   for (int i = 0; i < 4; i++)
+      new_mask |= ((mask >> i) & 1) << (3 - i);
+
+   return new_mask;
+}
+
+void
+vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_invocation_id:
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
+               invocation_id));
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+               brw_imm_d(key->input_vertices)));
+      break;
+   case nir_intrinsic_load_per_vertex_input: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
+      src_reg vertex_index =
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
+                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
+      break;
+   }
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
+      break;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];;
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS: {
+            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
+            dst_reg tmp(this, glsl_type::vec4_type);
+            emit_output_urb_read(tmp, 0, src_reg());
+            emit(MOV(writemask(dst, WRITEMASK_XY),
+                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+            break;
+         }
+         case GL_TRIANGLES:
+            /* DWord 4; use offset 1 but normal swizzle/writemask. */
+            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
+            break;
+         case GL_ISOLINES:
+            /* All channels are undefined. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelOuter[], which lives in the
+          * high 4 DWords of the Patch URB header, in reverse order.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            dst.writemask = WRITEMASK_XYZW;
+            break;
+         case GL_TRIANGLES:
+            dst.writemask = WRITEMASK_XYZ;
+            break;
+         case GL_ISOLINES:
+            dst.writemask = WRITEMASK_XY;
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+
+         dst_reg tmp(this, glsl_type::vec4_type);
+         emit_output_urb_read(tmp, 1, src_reg());
+         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+      } else {
+         emit_output_urb_read(dst, imm_offset, indirect_offset);
+      }
+      break;
+   }
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      src_reg value = get_nir_src(instr->src[0]);
+      unsigned mask = instr->const_index[1];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+             * We use an XXYX swizzle to reverse put .xy in the .wz
+             * channels, and use a .zw writemask.
+             */
+            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+            mask = writemask_for_backwards_vector(mask);
+            break;
+         case GL_TRIANGLES:
+            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+             * writemask to X and bump the URB offset by 1.
+             */
+            imm_offset = 1;
+            break;
+         case GL_ISOLINES:
+            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelOuter[] which lives in the
+          * Patch URB Header at DWords 4-7.  However, it's reversed, so
+          * instead of .xyzw we have .wzyx.
+          */
+         swiz = BRW_SWIZZLE_WZYX;
+         mask = writemask_for_backwards_vector(mask);
+      }
+
+      emit_urb_write(swizzle(value, swiz), mask,
+                     imm_offset, indirect_offset);
+      break;
+   }
+
+   case nir_intrinsic_barrier: {
+      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      break;
+   }
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const nir_shader *src_shader,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct brw_device_info *devinfo = compiler->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.patch_outputs_written = key->patch_outputs_written;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
+
+   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
+
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info.outputs_written,
+                            nir->info.patch_outputs_written);
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16;
+   output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 16;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES)
+      return false;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map,
+                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                       true);
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
+   }
+
+   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                      nir, mem_ctx, shader_time_index, &input_vue_map);
+   if (!v.run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+      return NULL;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      v.dump_instructions();
+
+   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                     &prog_data->base, v.cfg,
+                                     final_assembly_size);
+}
+
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
new file mode 100644 (file)
index 0000000..2c6801b
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.h
+ *
+ * The vec4-mode tessellation control shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TCS_H
+#define BRW_VEC4_TCS_H
+
+#include "brw_compiler.h"
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+/**
+ * vec4-mode compiler visitor for tessellation control shaders.
+ */
+class vec4_tcs_visitor : public vec4_visitor
+{
+public:
+   vec4_tcs_visitor(const struct brw_compiler *compiler,
+                    void *log_data,
+                    const struct brw_tcs_prog_key *key,
+                    struct brw_tcs_prog_data *prog_data,
+                    const nir_shader *nir,
+                    void *mem_ctx,
+                    int shader_time_index,
+                    const struct brw_vue_map *input_vue_map);
+
+protected:
+   virtual void emit_nir_code();
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+   /** Read an input value for the given vertex from the URB. */
+   void emit_input_urb_read(const dst_reg &dst,
+                            const src_reg &vertex_index,
+                            unsigned base_offset,
+                            const src_reg &indirect_offset);
+   /** Read back from this patch's own output URB entry. */
+   void emit_output_urb_read(const dst_reg &dst,
+                             unsigned base_offset,
+                             const src_reg &indirect_offset);
+
+   /** Masked write to this patch's output URB entry. */
+   void emit_urb_write(const src_reg &value, unsigned writemask,
+                       unsigned base_offset, const src_reg &indirect_offset);
+
+   /* we do not use the normal end-of-shader URB write mechanism -- but every vec4 stage
+    * must provide implementations of these:
+    */
+   virtual void emit_urb_write_header(int mrf) {}
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; }
+
+   /** VUE map describing the layout of this shader's per-vertex inputs. */
+   const struct brw_vue_map *input_vue_map;
+
+   /** Program key this shader is being compiled against. */
+   const struct brw_tcs_prog_key *key;
+   /** Register holding gl_InvocationID. */
+   src_reg invocation_id;
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TCS_H */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
new file mode 100644 (file)
index 0000000..ce5fefc
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.cpp
+ *
+ * Tessellation evaluation shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_vec4_tes.h"
+
+namespace brw {
+
+/**
+ * Construct a TES visitor; all real state lives in the vec4_visitor base.
+ */
+vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
+                                  void *log_data,
+                                  const struct brw_tes_prog_key *key,
+                                  struct brw_tes_prog_data *prog_data,
+                                  const nir_shader *shader,
+                                  void *mem_ctx,
+                                  int shader_time_index)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  shader, mem_ctx, false, shader_time_index)
+{
+}
+
+
+/**
+ * The TES exposes no system values through payload ATTR registers; its
+ * tess-level system values are handled in nir_setup_system_value_intrinsic()
+ * instead, so this always returns NULL.
+ */
+dst_reg *
+vec4_tes_visitor::make_reg_for_system_value(int location, const glsl_type *type)
+{
+   return NULL;
+}
+
+/**
+ * Pre-load tessellation-level system values from the patch URB header.
+ *
+ * gl_TessLevelOuter is read from URB offset 1 and stored reversed (.wzyx).
+ * gl_TessLevelInner's location depends on the domain: quads read it
+ * (reversed) from the first slot, other domains from offset 1 unswizzled.
+ */
+void
+vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   const struct brw_tes_prog_data *tes_prog_data =
+      (const struct brw_tes_prog_data *) prog_data;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_tess_level_outer: {
+      dst_reg dst(this, glsl_type::vec4_type);
+      nir_system_values[SYSTEM_VALUE_TESS_LEVEL_OUTER] = dst;
+
+      /* Read the slot holding the outer levels and undo the reversal. */
+      dst_reg temp(this, glsl_type::vec4_type);
+      vec4_instruction *read =
+         emit(VEC4_OPCODE_URB_READ, temp, input_read_header);
+      read->offset = 1;
+      read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WZYX)));
+      break;
+   }
+   case nir_intrinsic_load_tess_level_inner: {
+      dst_reg dst(this, glsl_type::vec2_type);
+      nir_system_values[SYSTEM_VALUE_TESS_LEVEL_INNER] = dst;
+
+      /* Set up the message header to reference the proper parts of the URB */
+      dst_reg temp(this, glsl_type::vec4_type);
+      vec4_instruction *read =
+         emit(VEC4_OPCODE_URB_READ, temp, input_read_header);
+      read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
+         emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WZYX)));
+      } else {
+         read->offset = 1;
+         emit(MOV(dst, src_reg(temp)));
+      }
+      break;
+   }
+   default:
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+   }
+}
+
+
+/**
+ * Lay out the TES thread payload: r0-r1 are fixed, then push constants;
+ * first_non_payload_grf marks the first register free for allocation.
+ */
+void
+vec4_tes_visitor::setup_payload()
+{
+   int reg = 0;
+
+   /* The payload always contains important data in r0 and r1, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg += 2;
+
+   reg = setup_uniforms(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+
+/**
+ * Per-thread setup: build the header register used by every input URB read.
+ */
+void
+vec4_tes_visitor::emit_prolog()
+{
+   input_read_header = src_reg(this, glsl_type::uvec4_type);
+   emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
+
+   /* Don't let setup instructions inherit a stale annotation. */
+   this->current_annotation = NULL;
+}
+
+
+/**
+ * URB write header setup -- intentionally a no-op for the DS stage.
+ */
+void
+vec4_tes_visitor::emit_urb_write_header(int mrf)
+{
+   /* No need to do anything for DS; an implied write to this MRF will be
+    * performed by VS_OPCODE_URB_WRITE.
+    */
+   (void) mrf;
+}
+
+
+/**
+ * Emit the URB write for a vertex.  When \p complete is set this is the
+ * final write, which also terminates the thread (EOT).
+ */
+vec4_instruction *
+vec4_tes_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* For DS, the URB writes end the thread. */
+   if (complete) {
+      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+         emit_shader_time_end();
+   }
+
+   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
+   inst->urb_write_flags = complete ?
+      BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
+
+   return inst;
+}
+
+/**
+ * Emit code for a TES-stage NIR intrinsic (gl_TessCoord, gl_PrimitiveID,
+ * and patch input reads); anything else falls through to vec4_visitor.
+ */
+void
+vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+               src_reg(brw_vec8_grf(1, 0))));
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TES_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      src_reg header = input_read_header;
+
+      if (indirect_offset.file != BAD_FILE) {
+         /* Fold the dynamic offset into a fresh copy of the read header. */
+         header = src_reg(this, glsl_type::uvec4_type);
+         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
+              input_read_header, indirect_offset);
+      }
+
+      dst_reg temp(this, glsl_type::ivec4_type);
+      vec4_instruction *read =
+         emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+      read->offset = imm_offset;
+      read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+      /* Copy to target.  We might end up with some funky writemasks landing
+       * in here, but we really don't want them in the above pseudo-ops.
+       */
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+      emit(MOV(dst, src_reg(temp)));
+      break;
+   }
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+/**
+ * End-of-thread handling for the DS stage.
+ */
+void
+vec4_tes_visitor::emit_thread_end()
+{
+   /* For DS, we always end the thread by emitting a single vertex.
+    * emit_urb_write_opcode() will take care of setting the eot flag on the
+    * SEND instruction.
+    */
+   emit_vertex();
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tes.h b/src/mesa/drivers/dri/i965/brw_vec4_tes.h
new file mode 100644 (file)
index 0000000..4b697aa
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.h
+ *
+ * The vec4 mode tessellation evaluation shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TES_H
+#define BRW_VEC4_TES_H
+
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+/**
+ * vec4-mode compiler visitor for tessellation evaluation shaders.
+ */
+class vec4_tes_visitor : public vec4_visitor
+{
+public:
+   vec4_tes_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
+                   const struct brw_tes_prog_key *key,
+                   struct brw_tes_prog_data *prog_data,
+                   const nir_shader *nir,
+                   void *mem_ctx,
+                   int shader_time_index);
+
+protected:
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+
+   virtual void emit_urb_write_header(int mrf);
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+
+private:
+   /** Header register for input URB reads; built in emit_prolog(). */
+   src_reg input_read_header;
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TES_H */
index 357dd4d..0c5bfb8 100644 (file)
@@ -622,6 +622,7 @@ type_size_vec4(const struct glsl_type *type)
    case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -877,6 +878,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
                            src_reg offset_value,
                            src_reg mcs,
                            bool is_cube_array,
+                           uint32_t surface,
+                           src_reg surface_reg,
                            uint32_t sampler,
                            src_reg sampler_reg)
 {
@@ -942,7 +945,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    inst->dst.writemask = WRITEMASK_XYZW;
    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 
-   inst->src[1] = sampler_reg;
+   inst->src[1] = surface_reg;
+   inst->src[2] = sampler_reg;
 
    /* MRF for the first parameter */
    int param_base = inst->base_mrf + inst->header_size;
@@ -1068,7 +1072,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    }
 
    if (devinfo->gen == 6 && op == ir_tg4) {
-      emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
    }
 
    if (op == ir_query_levels) {
index 205323c..86701f3 100644 (file)
@@ -155,7 +155,11 @@ vec4_vs_visitor::make_reg_for_system_value(int location,
    switch (location) {
    case SYSTEM_VALUE_BASE_VERTEX:
       reg->writemask = WRITEMASK_X;
-      vs_prog_data->uses_vertexid = true;
+      vs_prog_data->uses_basevertex = true;
+      break;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      reg->writemask = WRITEMASK_Y;
+      vs_prog_data->uses_baseinstance = true;
       break;
    case SYSTEM_VALUE_VERTEX_ID:
    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
@@ -166,6 +170,11 @@ vec4_vs_visitor::make_reg_for_system_value(int location,
       reg->writemask = WRITEMASK_W;
       vs_prog_data->uses_instanceid = true;
       break;
+   case SYSTEM_VALUE_DRAW_ID:
+      reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX + 1);
+      reg->writemask = WRITEMASK_X;
+      vs_prog_data->uses_drawid = true;
+      break;
    default:
       unreachable("not reached");
    }
index 59b748f..3095d82 100644 (file)
@@ -148,7 +148,9 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
                        &prog_data.base.vue_map, outputs_written,
-                       prog ? prog->SeparateShader : false);
+                       prog ? prog->SeparateShader ||
+                              prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]
+                            : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
index 6cb3da4..fea2436 100644 (file)
@@ -176,6 +176,73 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
    }
 
    vue_map->num_slots = separate ? slot + 1 : slot;
+   vue_map->num_per_vertex_slots = 0;
+   vue_map->num_per_patch_slots = 0;
+}
+
+/**
+ * Compute the VUE map for tessellation control shader outputs and
+ * tessellation evaluation shader inputs.
+ *
+ * Slots 0-1 are reserved for the tess levels (the "Patch Header"),
+ * followed by per-patch varyings, then per-vertex varyings.
+ */
+void
+brw_compute_tess_vue_map(struct brw_vue_map *vue_map,
+                         GLbitfield64 vertex_slots,
+                         GLbitfield patch_slots)
+{
+   /* I don't think anything actually uses this... */
+   vue_map->slots_valid = vertex_slots;
+
+   /* The tess levels get dedicated patch-header slots assigned below. */
+   vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
+                     VARYING_BIT_TESS_LEVEL_INNER);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to VARYING_SLOT_TESS_MAX, we need to ensure that
+    * VARYING_SLOT_TESS_MAX is <= 127, not 128.
+    */
+   STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127);
+
+   for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
+   }
+
+   int slot = 0;
+
+   /* The first 8 DWords are reserved for the "Patch Header".
+    *
+    * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout
+    * depends on the domain type.  They might not be in slots 0 and 1 as
+    * described here, but pretending they're separate allows us to uniquely
+    * identify them by distinct slot locations.
+    */
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++);
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++);
+
+   /* first assign per-patch varyings */
+   while (patch_slots != 0) {
+      const int varying = ffsll(patch_slots) - 1;
+      if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) {
+         assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++);
+      }
+      patch_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   /* apparently, including the patch header... */
+   vue_map->num_per_patch_slots = slot;
+
+   /* then assign per-vertex varyings for each vertex in our patch */
+   while (vertex_slots != 0) {
+      const int varying = ffsll(vertex_slots) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      vertex_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots;
+   vue_map->num_slots = slot;
 }
 
 static const char *
@@ -190,17 +257,35 @@ varying_name(brw_varying_slot slot)
       [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
    };
 
+   assert(slot < BRW_VARYING_SLOT_COUNT);
    return brw_names[slot - VARYING_SLOT_MAX];
 }
 
 void
 brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map)
 {
-   fprintf(fp, "VUE map (%d slots, %s)\n",
-           vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
-   for (int i = 0; i < vue_map->num_slots; i++) {
-      fprintf(fp, "  [%d] %s\n", i,
-              varying_name(vue_map->slot_to_varying[i]));
+   if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) {
+      fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
+              vue_map->num_slots,
+              vue_map->num_per_patch_slots,
+              vue_map->num_per_vertex_slots,
+              vue_map->separate ? "SSO" : "non-SSO");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) {
+            fprintf(fp, "  [%d] VARYING_SLOT_PATCH%d\n", i,
+                    vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0);
+         } else {
+            fprintf(fp, "  [%d] %s\n", i,
+                    varying_name(vue_map->slot_to_varying[i]));
+         }
+      }
+   } else {
+      fprintf(fp, "VUE map (%d slots, %s)\n",
+              vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         fprintf(fp, "  [%d] %s\n", i,
+                 varying_name(vue_map->slot_to_varying[i]));
+      }
    }
    fprintf(fp, "\n");
 }
index c4ebbf3..5ab2f7f 100644 (file)
@@ -862,10 +862,8 @@ brw_update_texture_surfaces(struct brw_context *brw)
    /* BRW_NEW_VERTEX_PROGRAM */
    struct gl_program *vs = (struct gl_program *) brw->vertex_program;
 
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
-
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
    struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
 
    /* BRW_NEW_GEOMETRY_PROGRAM */
@@ -915,8 +913,7 @@ const struct brw_tracked_state brw_texture_surfaces = {
              BRW_NEW_FS_PROG_DATA |
              BRW_NEW_GEOMETRY_PROGRAM |
              BRW_NEW_GS_PROG_DATA |
-             BRW_NEW_TESS_CTRL_PROGRAM |
-             BRW_NEW_TESS_EVAL_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_TCS_PROG_DATA |
              BRW_NEW_TES_PROG_DATA |
              BRW_NEW_TEXTURE_BUFFER |
@@ -949,12 +946,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_constant_surface(brw, bo, binding->Offset,
-                                     binding->BufferObject->Size - binding->Offset,
+                                     size,
                                      &ubo_surf_offsets[i]);
       }
    }
@@ -971,12 +971,15 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
       } else {
          struct intel_buffer_object *intel_bo =
             intel_buffer_object(binding->BufferObject);
+         GLsizeiptr size = binding->BufferObject->Size - binding->Offset;
+         if (!binding->AutomaticSize)
+            size = MIN2(size, binding->Size);
          drm_intel_bo *bo =
             intel_bufferobj_buffer(brw, intel_bo,
                                    binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset);
+                                   size);
          brw_create_buffer_surface(brw, bo, binding->Offset,
-                                   binding->BufferObject->Size - binding->Offset,
+                                   size,
                                    &ssbo_surf_offsets[i]);
       }
    }
index 6653a6d..4bc0a85 100644 (file)
@@ -32,7 +32,6 @@
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
 #include "intel_batchbuffer.h"
-#include "glsl/glsl_parser_extras.h"
 
 /**
  * Creates a streamed BO containing the push constants for the VS or GS on
@@ -65,7 +64,8 @@ gen6_upload_push_constants(struct brw_context *brw,
        * basic type of PROGRAM_STATE_VAR.
        */
       /* XXX: Should this happen somewhere before to get our state flag set? */
-      _mesa_load_state_parameters(ctx, prog->Parameters);
+      if (prog)
+         _mesa_load_state_parameters(ctx, prog->Parameters);
 
       gl_constant_value *param;
       unsigned i;
index e87b9d1..89b73ca 100644 (file)
@@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
    unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
    gen7_emit_push_constant_state(brw,
                                  urb_size / 2 /* vs_size */,
+                                 0 /* hs_size */,
+                                 0 /* ds_size */,
                                  0 /* gs_size */,
                                  urb_size / 2 /* fs_size */);
 
@@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
                        32 /* num_vs_entries */,
                        2 /* vs_size */,
                        2 /* vs_start */,
+                       0 /* num_hs_entries */,
+                       1 /* hs_size */,
+                       2 /* hs_start */,
+                       0 /* num_ds_entries */,
+                       1 /* ds_size */,
+                       2 /* ds_start */,
                        0 /* num_gs_entries */,
                        1 /* gs_size */,
                        2 /* gs_start */);
index 1fde69c..a025bb9 100644 (file)
@@ -141,7 +141,7 @@ brw_upload_cs_state(struct brw_context *brw)
       BEGIN_BATCH(4);
       OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
       OUT_BATCH(0);
-      OUT_BATCH(reg_aligned_constant_size * threads);
+      OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64));
       OUT_BATCH(stage_state->push_const_offset);
       ADVANCE_BATCH();
    }
@@ -249,8 +249,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
 
       param = (gl_constant_value*)
          brw_state_batch(brw, type,
-                         reg_aligned_constant_size * threads,
-                         32, &stage_state->push_const_offset);
+                         ALIGN(reg_aligned_constant_size * threads, 64),
+                         64, &stage_state->push_const_offset);
       assert(param);
 
       STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
index 4d3d94f..30deabb 100644 (file)
@@ -30,7 +30,7 @@ static void
 gen7_upload_tes_push_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    const struct brw_tess_eval_program *tep =
       (struct brw_tess_eval_program *) brw->tess_eval_program;
 
@@ -49,7 +49,7 @@ const struct brw_tracked_state gen7_tes_push_constants = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_EVAL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TES_PROG_DATA,
    },
    .emit = gen7_upload_tes_push_constants,
@@ -58,36 +58,64 @@ const struct brw_tracked_state gen7_tes_push_constants = {
 static void
 gen7_upload_ds_state(struct brw_context *brw)
 {
-   /* Disable the DS Unit */
-   BEGIN_BATCH(7);
-   OUT_BATCH(_3DSTATE_CONSTANT_DS << 16 | (7 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
+   const struct brw_stage_state *stage_state = &brw->tes.base;
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool active = brw->tess_eval_program;
 
-   BEGIN_BATCH(6);
-   OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
+   /* BRW_NEW_TES_PROG_DATA */
+   const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
+   const struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
+   const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
 
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2));
-   OUT_BATCH(brw->hw_bt_pool.next_offset);
-   ADVANCE_BATCH();
+   const unsigned thread_count = (brw->max_ds_threads - 1) <<
+      (brw->is_haswell ? HSW_DS_MAX_THREADS_SHIFT : GEN7_DS_MAX_THREADS_SHIFT);
+
+   if (active) {
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
+      OUT_BATCH(stage_state->prog_offset);
+      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
+                          GEN7_DS_SAMPLER_COUNT) |
+                SET_FIELD(prog_data->binding_table.size_bytes / 4,
+                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
+      if (prog_data->total_scratch) {
+         OUT_RELOC(stage_state->scratch_bo,
+                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                   ffs(prog_data->total_scratch) - 11);
+      } else {
+         OUT_BATCH(0);
+      }
+      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
+                          GEN7_DS_DISPATCH_START_GRF) |
+                SET_FIELD(vue_prog_data->urb_read_length,
+                          GEN7_DS_URB_READ_LENGTH));
+
+      OUT_BATCH(GEN7_DS_ENABLE |
+                GEN7_DS_STATISTICS_ENABLE |
+                thread_count |
+                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
+                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+   brw->tes.enabled = active;
 }
 
 const struct brw_tracked_state gen7_ds_state = {
    .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_CONTEXT,
+      .mesa  = _NEW_TRANSFORM,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_CONTEXT |
+               BRW_NEW_TESS_PROGRAMS |
+               BRW_NEW_TES_PROG_DATA,
    },
    .emit = gen7_upload_ds_state,
 };
index fcaa919..0e2b3b2 100644 (file)
@@ -30,26 +30,28 @@ static void
 gen7_upload_tcs_push_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    const struct brw_tess_ctrl_program *tcp =
       (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
+   bool active = brw->tess_eval_program;
 
-   if (tcp) {
+   if (active) {
       /* BRW_NEW_TCS_PROG_DATA */
       const struct brw_stage_prog_data *prog_data = &brw->tcs.prog_data->base.base;
       gen6_upload_push_constants(brw, &tcp->program.Base, prog_data,
                                       stage_state, AUB_TRACE_VS_CONSTANTS);
    }
 
-   gen7_upload_constant_state(brw, stage_state, tcp, _3DSTATE_CONSTANT_HS);
+   gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_HS);
 }
 
 const struct brw_tracked_state gen7_tcs_push_constants = {
    .dirty = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
       .brw   = BRW_NEW_BATCH |
+               BRW_NEW_DEFAULT_TESS_LEVELS |
                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_CTRL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TCS_PROG_DATA,
    },
    .emit = gen7_upload_tcs_push_constants,
@@ -58,37 +60,58 @@ const struct brw_tracked_state gen7_tcs_push_constants = {
 static void
 gen7_upload_hs_state(struct brw_context *brw)
 {
-   /* Disable the HS Unit */
-   BEGIN_BATCH(7);
-   OUT_BATCH(_3DSTATE_CONSTANT_HS << 16 | (7 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
+   const struct brw_stage_state *stage_state = &brw->tcs.base;
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool active = brw->tess_eval_program;
+   /* BRW_NEW_TCS_PROG_DATA */
+   const struct brw_vue_prog_data *prog_data = &brw->tcs.prog_data->base;
 
-   BEGIN_BATCH(7);
-   OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2));
-   OUT_BATCH(brw->hw_bt_pool.next_offset);
-   ADVANCE_BATCH();
+   if (active) {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
+      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
+                          GEN7_HS_SAMPLER_COUNT) |
+                SET_FIELD(prog_data->base.binding_table.size_bytes / 4,
+                          GEN7_HS_BINDING_TABLE_ENTRY_COUNT) |
+                (brw->max_hs_threads - 1));
+      OUT_BATCH(GEN7_HS_ENABLE |
+                GEN7_HS_STATISTICS_ENABLE |
+                SET_FIELD(brw->tcs.prog_data->instances - 1,
+                          GEN7_HS_INSTANCE_COUNT));
+      OUT_BATCH(stage_state->prog_offset);
+      if (prog_data->base.total_scratch) {
+         OUT_RELOC(stage_state->scratch_bo,
+                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                   ffs(prog_data->base.total_scratch) - 11);
+      } else {
+         OUT_BATCH(0);
+      }
+      OUT_BATCH(GEN7_HS_INCLUDE_VERTEX_HANDLES |
+                SET_FIELD(prog_data->base.dispatch_grf_start_reg,
+                          GEN7_HS_DISPATCH_START_GRF));
+      /* Ignore URB semaphores */
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(7);
+      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+   brw->tcs.enabled = active;
 }
 
 const struct brw_tracked_state gen7_hs_state = {
    .dirty = {
       .mesa  = 0,
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_TCS_PROG_DATA |
+               BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen7_upload_hs_state,
 };
index 2650fa5..f221307 100644 (file)
 static void
 upload_te_state(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    bool active = brw->tess_eval_program;
-   if (active)
-      assert(brw->tess_ctrl_program);
 
    const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
 
@@ -61,7 +59,7 @@ const struct brw_tracked_state gen7_te_state = {
       .mesa  = 0,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_TESS_EVAL_PROGRAM,
+               BRW_NEW_TESS_PROGRAMS,
    },
    .emit = upload_te_state,
 };
index 99a9d3c..00edbcc 100644 (file)
@@ -34,7 +34,7 @@
  *   __________-__________   _________________-_________________
  *  /                     \ /                                   \
  * +-------------------------------------------------------------+
- * |     VS/FS/GS Push     |              VS/GS URB              |
+ * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
  * |       Constants       |               Entries               |
  * +-------------------------------------------------------------+
  *
 static void
 gen7_allocate_push_constants(struct brw_context *brw)
 {
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool gs_present = brw->geometry_program;
+
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool tess_present = brw->tess_eval_program;
+
    unsigned avail_size = 16;
    unsigned multiplier =
       (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1;
 
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool gs_present = brw->geometry_program;
+   int stages = 2 + gs_present + 2 * tess_present;
 
-   unsigned vs_size, gs_size;
-   if (gs_present) {
-      vs_size = avail_size / 3;
-      avail_size -= vs_size;
-      gs_size = avail_size / 2;
-      avail_size -= gs_size;
-   } else {
-      vs_size = avail_size / 2;
-      avail_size -= vs_size;
-      gs_size = 0;
-   }
-   unsigned fs_size = avail_size;
+   /* Divide up the available space equally between stages.  Because we
+    * round down (using floor division), there may be some left over
+    * space.  We allocate that to the pixel shader stage.
+    */
+   unsigned size_per_stage = avail_size / stages;
+
+   unsigned vs_size = size_per_stage;
+   unsigned hs_size = tess_present ? size_per_stage : 0;
+   unsigned ds_size = tess_present ? size_per_stage : 0;
+   unsigned gs_size = gs_present ? size_per_stage : 0;
+   unsigned fs_size = avail_size - size_per_stage * (stages - 1);
 
    gen7_emit_push_constant_state(brw, multiplier * vs_size,
+                                 multiplier * hs_size, multiplier * ds_size,
                                  multiplier * gs_size, multiplier * fs_size);
 
    /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS):
@@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw)
 
 void
 gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                               unsigned gs_size, unsigned fs_size)
 {
    unsigned offset = 0;
 
-   BEGIN_BATCH(6);
+   BEGIN_BATCH(10);
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
    OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += vs_size;
 
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2));
+   OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += hs_size;
+
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2));
+   OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += ds_size;
+
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
    OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += gs_size;
@@ -130,7 +144,9 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
 const struct brw_tracked_state gen7_push_constant_space = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen7_allocate_push_constants,
 };
@@ -138,6 +154,7 @@ const struct brw_tracked_state gen7_push_constant_space = {
 static void
 gen7_upload_urb(struct brw_context *brw)
 {
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
    const int push_size_kB =
       (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;
 
@@ -149,6 +166,15 @@ gen7_upload_urb(struct brw_context *brw)
    unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
    unsigned gs_entry_size_bytes = gs_size * 64;
 
+   /* BRW_NEW_TESS_PROGRAMS */
+   const bool tess_present = brw->tess_eval_program;
+   /* BRW_NEW_TCS_PROG_DATA */
+   unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size : 1;
+   unsigned hs_entry_size_bytes = hs_size * 64;
+   /* BRW_NEW_TES_PROG_DATA */
+   unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size : 1;
+   unsigned ds_entry_size_bytes = ds_size * 64;
+
    /* If we're just switching between programs with the same URB requirements,
     * skip the rest of the logic.
     */
@@ -156,21 +182,29 @@ gen7_upload_urb(struct brw_context *brw)
        !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) &&
        brw->urb.vsize == vs_size &&
        brw->urb.gs_present == gs_present &&
-       brw->urb.gsize == gs_size) {
+       brw->urb.gsize == gs_size &&
+       brw->urb.tess_present == tess_present &&
+       brw->urb.hsize == hs_size &&
+       brw->urb.dsize == ds_size) {
       return;
    }
    brw->urb.vsize = vs_size;
    brw->urb.gs_present = gs_present;
    brw->urb.gsize = gs_size;
+   brw->urb.tess_present = tess_present;
+   brw->urb.hsize = hs_size;
+   brw->urb.dsize = ds_size;
 
    /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
     *
     *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
     *     Allocation Size is less than 9 512-bit URB entries.
     *
-    * Similar text exists for GS.
+    * Similar text exists for HS, DS and GS.
     */
    unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned hs_granularity = (hs_size < 9) ? 8 : 1;
+   unsigned ds_granularity = (ds_size < 9) ? 8 : 1;
    unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
 
    /* URB allocations must be done in 8k chunks. */
@@ -191,13 +225,20 @@ gen7_upload_urb(struct brw_context *brw)
     * additional space it could actually make use of).
     */
 
-   /* VS has a lower limit on the number of URB entries */
+   /* VS has a lower limit on the number of URB entries.
+    *
+    * From the Broadwell PRM, 3DSTATE_URB_VS instruction:
+    * "When tessellation is enabled, the VS Number of URB Entries must be
+    *  greater than or equal to 192."
+    */
+   unsigned vs_min_entries =
+      tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries;
+
    unsigned vs_chunks =
-      ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) /
-      chunk_size_bytes;
+      DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes);
    unsigned vs_wants =
-      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
-            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+      DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes,
+                   chunk_size_bytes) - vs_chunks;
 
    unsigned gs_chunks = 0;
    unsigned gs_wants = 0;
@@ -210,21 +251,42 @@ gen7_upload_urb(struct brw_context *brw)
        *
        * (2) We can't allocate less than nr_gs_entries_granularity.
        */
-      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
-                        chunk_size_bytes) / chunk_size_bytes;
-      gs_wants =
-         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
-               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+      gs_chunks = DIV_ROUND_UP(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
+                               chunk_size_bytes);
+      gs_wants = DIV_ROUND_UP(brw->urb.max_gs_entries * gs_entry_size_bytes,
+                              chunk_size_bytes) - gs_chunks;
+   }
+
+   unsigned hs_chunks = 0;
+   unsigned hs_wants = 0;
+   unsigned ds_chunks = 0;
+   unsigned ds_wants = 0;
+
+   if (tess_present) {
+      hs_chunks =
+         DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes,
+                      chunk_size_bytes);
+      hs_wants =
+         DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes,
+                      chunk_size_bytes) - hs_chunks;
+
+      ds_chunks =
+         DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes);
+      ds_wants =
+         DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes) - ds_chunks;
    }
 
    /* There should always be enough URB space to satisfy the minimum
     * requirements of each stage.
     */
-   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   unsigned total_needs = push_constant_chunks +
+                          vs_chunks + hs_chunks + ds_chunks + gs_chunks;
    assert(total_needs <= urb_chunks);
 
    /* Mete out remaining space (if any) in proportion to "wants". */
-   unsigned total_wants = vs_wants + gs_wants;
+   unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants;
    unsigned remaining_space = urb_chunks - total_needs;
    if (remaining_space > total_wants)
       remaining_space = total_wants;
@@ -233,61 +295,100 @@ gen7_upload_urb(struct brw_context *brw)
          roundf(vs_wants * (((float) remaining_space) / total_wants));
       vs_chunks += vs_additional;
       remaining_space -= vs_additional;
+      total_wants -= vs_wants;
+
+      unsigned hs_additional = (unsigned)
+         round(hs_wants * (((double) remaining_space) / total_wants));
+      hs_chunks += hs_additional;
+      remaining_space -= hs_additional;
+      total_wants -= hs_wants;
+
+      unsigned ds_additional = (unsigned)
+         round(ds_wants * (((double) remaining_space) / total_wants));
+      ds_chunks += ds_additional;
+      remaining_space -= ds_additional;
+      total_wants -= ds_wants;
+
       gs_chunks += remaining_space;
    }
 
    /* Sanity check that we haven't over-allocated. */
-   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+   assert(push_constant_chunks +
+          vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks);
 
    /* Finally, compute the number of entries that can fit in the space
     * allocated to each stage.
     */
    unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes;
+   unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes;
    unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
 
    /* Since we rounded up when computing *_wants, this may be slightly more
     * than the maximum allowed amount, so correct for that.
     */
    nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
+   nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries);
+   nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries);
    nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
 
    /* Ensure that we program a multiple of the granularity. */
    nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity);
+   nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity);
    nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
 
    /* Finally, sanity check to make sure we have at least the minimum number
     * of entries needed for each stage.
     */
-   assert(nr_vs_entries >= brw->urb.min_vs_entries);
+   assert(nr_vs_entries >= vs_min_entries);
    if (gs_present)
       assert(nr_gs_entries >= 2);
+   if (tess_present) {
+      assert(nr_hs_entries >= 1);
+      assert(nr_ds_entries >= devinfo->urb.min_ds_entries);
+   }
 
    /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
     * better to put reasonable data in there rather than leave them
     * uninitialized.
     */
    brw->urb.nr_vs_entries = nr_vs_entries;
+   brw->urb.nr_hs_entries = nr_hs_entries;
+   brw->urb.nr_ds_entries = nr_ds_entries;
    brw->urb.nr_gs_entries = nr_gs_entries;
 
    /* Lay out the URB in the following order:
     * - push constants
     * - VS
+    * - HS
+    * - DS
     * - GS
     */
    brw->urb.vs_start = push_constant_chunks;
-   brw->urb.gs_start = push_constant_chunks + vs_chunks;
+   brw->urb.hs_start = push_constant_chunks + vs_chunks;
+   brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks;
+   brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks +
+                       ds_chunks;
 
    if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
    gen7_emit_urb_state(brw,
                        brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
+                       brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start,
+                       brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start,
                        brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
 }
 
 void
 gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                     unsigned gs_size, unsigned gs_start)
 {
    BEGIN_BATCH(8);
@@ -301,14 +402,15 @@ gen7_emit_urb_state(struct brw_context *brw,
              ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
-   /* Allocate the HS and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_hs_entries |
+             ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
    OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_ds_entries |
+             ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
    ADVANCE_BATCH();
 }
 
@@ -318,7 +420,10 @@ const struct brw_tracked_state gen7_urb = {
       .brw = BRW_NEW_CONTEXT |
              BRW_NEW_URB_SIZE |
              BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_GS_PROG_DATA |
+             BRW_NEW_TCS_PROG_DATA |
+             BRW_NEW_TES_PROG_DATA |
              BRW_NEW_VS_PROG_DATA,
    },
    .emit = gen7_upload_urb,
index 06d5e65..7def5f5 100644 (file)
@@ -77,14 +77,11 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_KILL_ENABLE;
    }
 
-   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) {
-      dw1 |= GEN7_WM_DISPATCH_ENABLE;
-   }
-
    /* _NEW_BUFFERS | _NEW_COLOR */
+   const bool active_fs_has_side_effects =
+      _mesa_active_fragment_shader_has_side_effects(&brw->ctx);
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
-       prog_data->base.nr_image_params ||
-       dw1 & GEN7_WM_KILL_ENABLE) {
+       active_fs_has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) {
       dw1 |= GEN7_WM_DISPATCH_ENABLE;
    }
    if (multisampled_fbo) {
@@ -110,7 +107,7 @@ upload_wm_state(struct brw_context *brw)
    /* BRW_NEW_FS_PROG_DATA */
    if (prog_data->early_fragment_tests)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (prog_data->base.nr_image_params)
+   else if (active_fs_has_side_effects)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 
    /* The "UAV access enable" bits are unnecessary on HSW because they only
@@ -123,7 +120,7 @@ upload_wm_state(struct brw_context *brw)
     */
    if (brw->is_haswell &&
        !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
-       prog_data->base.nr_image_params)
+       active_fs_has_side_effects)
       dw2 |= HSW_WM_UAV_ONLY;
 
    BEGIN_BATCH(3);
index 198d612..ff89e5f 100644 (file)
@@ -115,7 +115,12 @@ gen8_emit_vertices(struct brw_context *brw)
    }
 
    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
-   unsigned nr_buffers = brw->vb.nr_buffers + brw->vs.prog_data->uses_vertexid;
+   const bool uses_draw_params =
+      brw->vs.prog_data->uses_basevertex ||
+      brw->vs.prog_data->uses_baseinstance;
+   const unsigned nr_buffers = brw->vb.nr_buffers +
+      uses_draw_params + brw->vs.prog_data->uses_drawid;
+
    if (nr_buffers) {
       assert(nr_buffers <= 33);
 
@@ -135,7 +140,7 @@ gen8_emit_vertices(struct brw_context *brw)
          OUT_BATCH(buffer->bo->size);
       }
 
-      if (brw->vs.prog_data->uses_vertexid) {
+      if (uses_draw_params) {
          OUT_BATCH(brw->vb.nr_buffers << GEN6_VB0_INDEX_SHIFT |
                    GEN7_VB0_ADDRESS_MODIFYENABLE |
                    mocs_wb << 16);
@@ -143,21 +148,33 @@ gen8_emit_vertices(struct brw_context *brw)
                      brw->draw.draw_params_offset);
          OUT_BATCH(brw->draw.draw_params_bo->size);
       }
+
+      if (brw->vs.prog_data->uses_drawid) {
+         OUT_BATCH((brw->vb.nr_buffers + 1) << GEN6_VB0_INDEX_SHIFT |
+                   GEN7_VB0_ADDRESS_MODIFYENABLE |
+                   mocs_wb << 16);
+         OUT_RELOC64(brw->draw.draw_id_bo, I915_GEM_DOMAIN_VERTEX, 0,
+                     brw->draw.draw_id_offset);
+         OUT_BATCH(brw->draw.draw_id_bo->size);
+      }
       ADVANCE_BATCH();
    }
 
    /* Normally we don't need an element for the SGVS attribute because the
     * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
-    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if the
-    * vertex ID is used then it needs an element for the base vertex buffer.
-    * Additionally if there is an edge flag element then the SGVS can't be
-    * inserted past that so we need a dummy element to ensure that the edge
-    * flag is the last one.
+    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
+    * we're using draw parameters then we need an element for the those
+    * values.  Additionally if there is an edge flag element then the SGVS
+    * can't be inserted past that so we need a dummy element to ensure that
+    * the edge flag is the last one.
     */
-   bool needs_sgvs_element = (brw->vs.prog_data->uses_vertexid ||
-                              (brw->vs.prog_data->uses_instanceid &&
-                               uses_edge_flag));
-   unsigned nr_elements = brw->vb.nr_enabled + needs_sgvs_element;
+   const bool needs_sgvs_element = (brw->vs.prog_data->uses_basevertex ||
+                                    brw->vs.prog_data->uses_baseinstance ||
+                                    ((brw->vs.prog_data->uses_instanceid ||
+                                      brw->vs.prog_data->uses_vertexid) &&
+                                     uses_edge_flag));
+   const unsigned nr_elements =
+      brw->vb.nr_enabled + needs_sgvs_element + brw->vs.prog_data->uses_drawid;
 
    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
     * presumably for VertexID/InstanceID.
@@ -212,12 +229,13 @@ gen8_emit_vertices(struct brw_context *brw)
    }
 
    if (needs_sgvs_element) {
-      if (brw->vs.prog_data->uses_vertexid) {
+      if (brw->vs.prog_data->uses_basevertex ||
+          brw->vs.prog_data->uses_baseinstance) {
          OUT_BATCH(GEN6_VE0_VALID |
                    brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
-                   BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
+                   BRW_SURFACEFORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT);
          OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT) |
                    (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                    (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
       } else {
@@ -229,6 +247,16 @@ gen8_emit_vertices(struct brw_context *brw)
       }
    }
 
+   if (brw->vs.prog_data->uses_drawid) {
+      OUT_BATCH(GEN6_VE0_VALID |
+                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
+                (BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT));
+      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
+   }
+
    if (gen6_edgeflag_input) {
       uint32_t format =
          brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
@@ -266,6 +294,15 @@ gen8_emit_vertices(struct brw_context *brw)
       OUT_BATCH(buffer->step_rate);
       ADVANCE_BATCH();
    }
+
+   if (brw->vs.prog_data->uses_drawid) {
+      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
+      BEGIN_BATCH(3);
+      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
+      OUT_BATCH(element);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
 }
 
 const struct brw_tracked_state gen8_vertices = {
index a79e8aa..d91eb77 100644 (file)
@@ -31,9 +31,8 @@ gen8_upload_ds_state(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    bool active = brw->tess_eval_program;
-   assert(!active || brw->tess_ctrl_program);
 
    /* BRW_NEW_TES_PROG_DATA */
    const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
@@ -92,7 +91,7 @@ const struct brw_tracked_state gen8_ds_state = {
    .dirty = {
       .mesa  = 0,
       .brw   = BRW_NEW_BATCH |
-               BRW_NEW_TESS_EVAL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TES_PROG_DATA,
    },
    .emit = gen8_upload_ds_state,
index 38e2235..21f3d46 100644 (file)
@@ -30,9 +30,8 @@ static void
 gen8_upload_hs_state(struct brw_context *brw)
 {
    const struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
-   bool active = brw->tess_ctrl_program;
-   assert(!active || brw->tess_eval_program);
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool active = brw->tess_eval_program;
    /* BRW_NEW_HS_PROG_DATA */
    const struct brw_vue_prog_data *prog_data = &brw->tcs.prog_data->base;
 
@@ -84,7 +83,7 @@ const struct brw_tracked_state gen8_hs_state = {
       .mesa  = 0,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_TCS_PROG_DATA |
-               BRW_NEW_TESS_CTRL_PROGRAM,
+               BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen8_upload_hs_state,
 };
index 945f710..74cdcef 100644 (file)
@@ -90,8 +90,7 @@ gen8_upload_ps_extra(struct brw_context *brw,
     *
     * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
     */
-   if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
-        prog_data->base.nr_image_params) &&
+   if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) &&
        !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
@@ -157,7 +156,7 @@ upload_wm_state(struct brw_context *brw)
    /* BRW_NEW_FS_PROG_DATA */
    if (brw->wm.prog_data->early_fragment_tests)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (brw->wm.prog_data->base.nr_image_params)
+   else if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx))
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 
    BEGIN_BATCH(2);
index bd204aa..6d29fbd 100644 (file)
@@ -406,11 +406,6 @@ can_fast_copy_blit(struct brw_context *brw,
    if (brw->gen < 9)
       return false;
 
-   if (src_buffer->handle == dst_buffer->handle &&
-       _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h,
-                             dst_x, dst_y, dst_x + w, dst_y + h))
-      return false;
-
    /* Enable fast copy blit only if the surfaces are Yf/Ys tiled.
     * FIXME: Based on performance data, remove this condition later to
     * enable for all types of surfaces.
@@ -427,8 +422,10 @@ can_fast_copy_blit(struct brw_context *brw,
    if ((dst_offset | src_offset) & 63)
       return false;
 
-   /* Color depth greater than 128 bits not supported. */
-   if (cpp > 16)
+   /* Color depths which are not power of 2 or greater than 128 bits are
+    * not supported.
+    */
+   if (!_mesa_is_pow_two(cpp) || cpp > 16)
       return false;
 
    /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
@@ -567,9 +564,10 @@ intelEmitCopyBlit(struct brw_context *brw,
                                            dst_offset, dst_pitch,
                                            dst_tiling, dst_tr_mode,
                                            w, h, cpp);
-   assert(use_fast_copy_blit ||
-          (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
-           dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE));
+   if (!use_fast_copy_blit &&
+       (src_tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        dst_tr_mode != INTEL_MIPTREE_TRMODE_NONE))
+      return false;
 
    if (use_fast_copy_blit) {
       /* When two sequential fast copy blits have different source surfaces,
index 7a5b3fc..56da2da 100644 (file)
@@ -167,7 +167,7 @@ brw_delete_buffer(struct gl_context * ctx, struct gl_buffer_object *obj)
    _mesa_buffer_unmap_all_mappings(ctx, obj);
 
    drm_intel_bo_unreference(intel_obj->buffer);
-   free(intel_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
index 24761a7..889f7cb 100644 (file)
@@ -203,6 +203,7 @@ intelInitExtensions(struct gl_context *ctx)
    ctx->Extensions.ARB_point_sprite = true;
    ctx->Extensions.ARB_seamless_cube_map = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
+   ctx->Extensions.ARB_shader_draw_parameters = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shadow = true;
    ctx->Extensions.ARB_sync = true;
@@ -333,6 +334,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_shader_image_load_store = true;
       ctx->Extensions.ARB_shader_image_size = true;
       ctx->Extensions.ARB_shader_texture_image_samples = true;
+      ctx->Extensions.ARB_tessellation_shader = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
       ctx->Extensions.ARB_texture_view = true;
       ctx->Extensions.ARB_shader_storage_buffer_object = true;
@@ -344,6 +346,9 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_transform_feedback3 = true;
          ctx->Extensions.ARB_transform_feedback_instanced = true;
 
+         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
+            ctx->Extensions.ARB_compute_shader = true;
+
          if (brw->intelScreen->cmd_parser_version >= 2)
             brw->predicate.supported = true;
       }
@@ -355,8 +360,6 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
          ctx->Extensions.ARB_shader_subroutine = true;
-         if (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024)
-            ctx->Extensions.ARB_compute_shader = true;
       }
    }
 
index 88c0a19..108dd87 100644 (file)
@@ -2697,13 +2697,17 @@ use_intel_mipree_map_blit(struct brw_context *brw,
 {
    if (brw->has_llc &&
       /* It's probably not worth swapping to the blit ring because of
-       * all the overhead involved.
+       * all the overhead involved. But, we must use the blitter for the
+       * surfaces with INTEL_MIPTREE_TRMODE_{YF,YS}.
        */
-       !(mode & GL_MAP_WRITE_BIT) &&
+       (!(mode & GL_MAP_WRITE_BIT) ||
+        mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) &&
        !mt->compressed &&
        (mt->tiling == I915_TILING_X ||
         /* Prior to Sandybridge, the blitter can't handle Y tiling */
-        (brw->gen >= 6 && mt->tiling == I915_TILING_Y)) &&
+        (brw->gen >= 6 && mt->tiling == I915_TILING_Y) ||
+        /* Fast copy blit on skl+ supports all tiling formats. */
+        brw->gen >= 9) &&
        can_blit_slice(mt, level, slice))
       return true;
 
@@ -2772,6 +2776,8 @@ intel_miptree_map(struct brw_context *brw,
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
+      /* intel_miptree_map_gtt() doesn't support surfaces with Yf/Ys tiling. */
+      assert(mt->tr_mode == INTEL_MIPTREE_TRMODE_NONE);
       intel_miptree_map_gtt(brw, mt, map, level, slice);
    }
 
index 3b5bdb8..05c35bd 100644 (file)
@@ -142,7 +142,7 @@ do_blit_copypixels(struct gl_context * ctx,
    }
 
    if (ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F) {
-      perf_debug("glCopyPixles(): Unsupported pixel zoom\n");
+      perf_debug("glCopyPixels(): Unsupported pixel zoom\n");
       return false;
    }
 
index 825a7c1..e1e1e62 100644 (file)
@@ -999,13 +999,14 @@ intelCreateBuffer(__DRIscreen * driScrnPriv,
       fb->Visual.samples = num_samples;
    }
 
-   if (mesaVis->redBits == 5) {
+   if (mesaVis->redBits == 5)
       rgbFormat = MESA_FORMAT_B5G6R5_UNORM;
-   } else {
-      if (mesaVis->alphaBits == 0)
-         rgbFormat = MESA_FORMAT_B8G8R8X8_SRGB;
-      else
-         rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB;
+   else if (mesaVis->sRGBCapable)
+      rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB;
+   else if (mesaVis->alphaBits == 0)
+      rgbFormat = MESA_FORMAT_B8G8R8X8_UNORM;
+   else {
+      rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB;
       fb->Visual.sRGBCapable = true;
    }
 
@@ -1338,6 +1339,11 @@ set_max_gl_versions(struct intel_screen *screen)
    switch (screen->devinfo->gen) {
    case 9:
    case 8:
+      psp->max_gl_core_version = 33;
+      psp->max_gl_compat_version = 30;
+      psp->max_gl_es1_version = 11;
+      psp->max_gl_es2_version = 31;
+      break;
    case 7:
    case 6:
       psp->max_gl_core_version = 33;
@@ -1443,8 +1449,6 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    if (INTEL_DEBUG & DEBUG_AUB)
       drm_intel_bufmgr_gem_set_aub_dump(intelScreen->bufmgr, true);
 
-   intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
-
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
    intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
 
@@ -1491,6 +1495,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 
    intelScreen->compiler = brw_compiler_create(intelScreen,
                                                intelScreen->devinfo);
+   intelScreen->program_id = 1;
 
    if (intelScreen->devinfo->has_resource_streamer) {
       int val = -1;
index 96bb995..3a5f22c 100644 (file)
@@ -46,8 +46,6 @@ struct intel_screen
 
    bool no_hw;
 
-   bool hw_must_use_separate_stencil;
-
    bool hw_has_swizzling;
 
    int hw_has_timestamp;
index 7f31b28..998e751 100644 (file)
@@ -35,7 +35,7 @@
 
 #include "drivers/common/meta.h"
 
-const char const *nouveau_vendor_string = "Nouveau";
+const char * const nouveau_vendor_string = "Nouveau";
 
 const char *
 nouveau_get_renderer_string(unsigned chipset)
index a4273a5..237e956 100644 (file)
@@ -69,7 +69,7 @@ struct nouveau_driver {
 #define nouveau_error(format, ...) \
        fprintf(stderr, "%s: " format, __func__, ## __VA_ARGS__)
 
-extern const char const *nouveau_vendor_string;
+extern const char * const nouveau_vendor_string;
 
 const char *
 nouveau_get_renderer_string(unsigned chipset);
index 153f18e..6f61f66 100644 (file)
@@ -40,6 +40,9 @@
 #include "main/renderbuffer.h"
 #include "swrast/s_renderbuffer.h"
 
+#include <nvif/class.h>
+#include <nvif/cl0080.h>
+
 static const __DRIextension *nouveau_screen_extensions[];
 
 static void
@@ -99,12 +102,22 @@ nouveau_init_screen2(__DRIscreen *dri_screen)
        dri_screen->driverPrivate = screen;
 
        /* Open the DRM device. */
-       ret = nouveau_device_wrap(dri_screen->fd, 0, &screen->device);
+       ret = nouveau_drm_new(dri_screen->fd, &screen->drm);
        if (ret) {
                nouveau_error("Error opening the DRM device.\n");
                goto fail;
        }
 
+       ret = nouveau_device_new(&screen->drm->client, NV_DEVICE,
+                                &(struct nv_device_v0) {
+                                       .device = ~0ULL,
+                                }, sizeof(struct nv_device_v0),
+                                &screen->device);
+       if (ret) {
+               nouveau_error("Error creating device object.\n");
+               goto fail;
+       }
+
        /* Choose the card specific function pointers. */
        switch (screen->device->chipset & 0xf0) {
        case 0x00:
@@ -213,6 +226,7 @@ nouveau_destroy_screen(__DRIscreen *dri_screen)
                return;
 
        nouveau_device_del(&screen->device);
+       nouveau_drm_del(&screen->drm);
 
        free(screen);
        dri_screen->driverPrivate = NULL;
index 45b1ee9..e3c1928 100644 (file)
@@ -33,6 +33,7 @@ struct nouveau_context;
 
 struct nouveau_screen {
        __DRIscreen *dri_screen;
+       struct nouveau_drm *drm;
        struct nouveau_device *device;
        const struct nouveau_driver *driver;
 };
index d9d4f5f..2b76305 100644 (file)
@@ -71,7 +71,7 @@ radeonDeleteBufferObject(struct gl_context * ctx,
         radeon_bo_unref(radeon_obj->bo);
     }
 
-    free(radeon_obj);
+    _mesa_delete_buffer_object(ctx, obj);
 }
 
 
index 5c7dcac..8462ab6 100644 (file)
@@ -645,10 +645,100 @@ GLAPI OSMesaContext GLAPIENTRY
 OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
                         GLint accumBits, OSMesaContext sharelist )
 {
+   int attribs[100], n = 0;
+
+   attribs[n++] = OSMESA_FORMAT;
+   attribs[n++] = format;
+   attribs[n++] = OSMESA_DEPTH_BITS;
+   attribs[n++] = depthBits;
+   attribs[n++] = OSMESA_STENCIL_BITS;
+   attribs[n++] = stencilBits;
+   attribs[n++] = OSMESA_ACCUM_BITS;
+   attribs[n++] = accumBits;
+   attribs[n++] = 0;
+
+   return OSMesaCreateContextAttribs(attribs, sharelist);
+}
+
+
+/**
+ * New in Mesa 11.2
+ *
+ * Create context with attribute list.
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist)
+{
    OSMesaContext osmesa;
    struct dd_function_table functions;
    GLint rind, gind, bind, aind;
    GLint redBits = 0, greenBits = 0, blueBits = 0, alphaBits =0;
+   GLenum format = OSMESA_RGBA;
+   GLint depthBits = 0, stencilBits = 0, accumBits = 0;
+   int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0;
+   gl_api api_profile = API_OPENGL_COMPAT;
+   int i;
+
+   for (i = 0; attribList[i]; i += 2) {
+      switch (attribList[i]) {
+      case OSMESA_FORMAT:
+         format = attribList[i+1];
+         switch (format) {
+         case OSMESA_COLOR_INDEX:
+         case OSMESA_RGBA:
+         case OSMESA_BGRA:
+         case OSMESA_ARGB:
+         case OSMESA_RGB:
+         case OSMESA_BGR:
+         case OSMESA_RGB_565:
+            /* legal */
+            break;
+         default:
+            return NULL;
+         }
+         break;
+      case OSMESA_DEPTH_BITS:
+         depthBits = attribList[i+1];
+         if (depthBits < 0)
+            return NULL;
+         break;
+      case OSMESA_STENCIL_BITS:
+         stencilBits = attribList[i+1];
+         if (stencilBits < 0)
+            return NULL;
+         break;
+      case OSMESA_ACCUM_BITS:
+         accumBits = attribList[i+1];
+         if (accumBits < 0)
+            return NULL;
+         break;
+      case OSMESA_PROFILE:
+         profile = attribList[i+1];
+         if (profile == OSMESA_COMPAT_PROFILE)
+            api_profile = API_OPENGL_COMPAT;
+         else if (profile == OSMESA_CORE_PROFILE)
+            api_profile = API_OPENGL_CORE;
+         else
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MAJOR_VERSION:
+         version_major = attribList[i+1];
+         if (version_major < 1)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MINOR_VERSION:
+         version_minor = attribList[i+1];
+         if (version_minor < 0)
+            return NULL;
+         break;
+      case 0:
+         /* end of list */
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n");
+         return NULL;
+      }
+   }
 
    rind = gind = bind = aind = 0;
    if (format==OSMESA_RGBA) {
@@ -742,7 +832,7 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
       functions.UpdateState = osmesa_update_state;
 
       if (!_mesa_initialize_context(&osmesa->mesa,
-                                    API_OPENGL_COMPAT,
+                                    api_profile,
                                     osmesa->gl_visual,
                                     sharelist ? &sharelist->mesa
                                               : (struct gl_context *) NULL,
@@ -819,6 +909,13 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
 
          _mesa_compute_version(ctx);
 
+         if (ctx->Version < version_major * 10 + version_minor) {
+            _mesa_destroy_visual(osmesa->gl_visual);
+            _mesa_free_context_data(ctx);
+            free(osmesa);
+            return NULL;
+         }
+
          /* Exec table initialization requires the version to be computed */
          _mesa_initialize_dispatch_tables(ctx);
          _mesa_initialize_vbo_vtxfmt(ctx);
@@ -1121,6 +1218,7 @@ struct name_function
 static struct name_function functions[] = {
    { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext },
    { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt },
+   { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs },
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
index 92d8238..c84db5f 100644 (file)
@@ -65,7 +65,7 @@ typedef struct {
 typedef struct {
    AEarray arrays[32];
    AEattrib attribs[VERT_ATTRIB_MAX + 1];
-   GLuint NewState;
+   GLbitfield NewState;
 
    /* List of VBOs we need to map before executing ArrayElements */
    struct gl_buffer_object *vbo[VERT_ATTRIB_MAX];
@@ -1802,7 +1802,7 @@ _ae_ArrayElement(GLint elt)
 
 
 void
-_ae_invalidate_state(struct gl_context *ctx, GLuint new_state)
+_ae_invalidate_state(struct gl_context *ctx, GLbitfield new_state)
 {
    AEcontext *actx = AE_CONTEXT(ctx);
 
index 39fdeb9..03cd9ec 100644 (file)
@@ -33,7 +33,7 @@
 
 extern GLboolean _ae_create_context( struct gl_context *ctx );
 extern void _ae_destroy_context( struct gl_context *ctx );
-extern void _ae_invalidate_state( struct gl_context *ctx, GLuint new_state );
+extern void _ae_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 extern void GLAPIENTRY _ae_ArrayElement( GLint elt );
 
 /* May optionally be called before a batch of element calls:
index a7fd82c..8b63d9c 100644 (file)
@@ -629,7 +629,10 @@ _mesa_Vertex2sv( const GLshort *v )
 void GLAPIENTRY
 _mesa_Vertex3dv( const GLdouble *v )
 {
-   VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
+   if (v[2] == 0.0)
+      VERTEX2( (GLfloat) v[0], (GLfloat) v[1] );
+   else
+      VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] );
 }
 
 void GLAPIENTRY
index d693ec6..2b62997 100644 (file)
@@ -920,6 +920,121 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+static GLboolean
+valid_draw_indirect_parameters(struct gl_context *ctx,
+                               const char *name,
+                               GLintptr drawcount)
+{
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_VALUE is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if <drawcount> is not a multiple of
+    *  four."
+    */
+   if (drawcount & 3) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(drawcount is not a multiple of 4)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if no buffer is bound to the
+    *  PARAMETER_BUFFER_ARB binding point."
+    */
+   if (!_mesa_is_bufferobj(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s: no buffer bound to PARAMETER_BUFFER", name);
+      return GL_FALSE;
+   }
+
+   if (_mesa_check_disallowed_mapping(ctx->ParameterBuffer)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER is mapped)", name);
+      return GL_FALSE;
+   }
+
+   /* From the ARB_indirect_parameters specification:
+    * "INVALID_OPERATION is generated by MultiDrawArraysIndirectCountARB or
+    *  MultiDrawElementsIndirectCountARB if reading a <sizei> typed value
+    *  from the buffer bound to the PARAMETER_BUFFER_ARB target at the offset
+    *  specified by <drawcount> would result in an out-of-bounds access."
+    */
+   if (ctx->ParameterBuffer->Size < drawcount + sizeof(GLsizei)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(PARAMETER_BUFFER too small)", name);
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawArraysNumParams = 4;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawArraysNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawArraysNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect(ctx, mode, (void *)indirect, size,
+                            "glMultiDrawArraysIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawArraysIndirectCountARB", drawcount);
+}
+
+GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride)
+{
+   GLsizeiptr size = 0;
+   const unsigned drawElementsNumParams = 5;
+
+   FLUSH_CURRENT(ctx, 0);
+
+   /* caller has converted stride==0 to drawElementsNumParams * sizeof(GLuint) */
+   assert(stride != 0);
+
+   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                  "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   /* number of bytes of the indirect buffer which will be read */
+   size = maxdrawcount
+      ? (maxdrawcount - 1) * stride + drawElementsNumParams * sizeof(GLuint)
+      : 0;
+
+   if (!valid_draw_indirect_elements(ctx, mode, type,
+                                     (void *)indirect, size,
+                                     "glMultiDrawElementsIndirectCountARB"))
+      return GL_FALSE;
+
+   return valid_draw_indirect_parameters(
+         ctx, "glMultiDrawElementsIndirectCountARB", drawcount);
+}
+
 static bool
 check_valid_to_compute(struct gl_context *ctx, const char *function)
 {
index 5d030a7..5b321e3 100644 (file)
@@ -106,6 +106,22 @@ _mesa_validate_MultiDrawElementsIndirect(struct gl_context *ctx,
                                          GLsizei stride);
 
 extern GLboolean
+_mesa_validate_MultiDrawArraysIndirectCount(struct gl_context *ctx,
+                                            GLenum mode,
+                                            GLintptr indirect,
+                                            GLintptr drawcount,
+                                            GLsizei maxdrawcount,
+                                            GLsizei stride);
+
+extern GLboolean
+_mesa_validate_MultiDrawElementsIndirectCount(struct gl_context *ctx,
+                                              GLenum mode, GLenum type,
+                                              GLintptr indirect,
+                                              GLintptr drawcount,
+                                              GLsizei maxdrawcount,
+                                              GLsizei stride);
+
+extern GLboolean
 _mesa_validate_DispatchCompute(struct gl_context *ctx,
                                const GLuint *num_groups);
 
index 935ba05..8fcbff6 100644 (file)
@@ -293,7 +293,7 @@ _mesa_DeleteFragmentShaderATI(GLuint id)
         prog->RefCount--;
         if (prog->RefCount <= 0) {
            assert(prog != &DummyShader);
-           free(prog);
+            _mesa_delete_ati_fragment_shader(ctx, prog);
         }
       }
    }
@@ -345,6 +345,9 @@ _mesa_BeginFragmentShaderATI(void)
    ctx->ATIFragmentShader.Current->isValid = GL_FALSE;
    ctx->ATIFragmentShader.Current->swizzlerq = 0;
    ctx->ATIFragmentShader.Compiling = 1;
+#if MESA_DEBUG_ATI_FS
+   _mesa_debug(ctx, "%s %u\n", __func__, ctx->ATIFragmentShader.Current->Id);
+#endif
 }
 
 void GLAPIENTRY
index abc5539..5729e60 100644 (file)
@@ -286,8 +286,17 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
             }
             /* extra checks for multisample copies... */
             if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) {
-               /* color formats must match */
-               if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) {
+               /* color formats must match on GLES. This isn't checked on
+                * desktop GL because the GL 4.4 spec was changed to allow it.
+                * In the section entitled “Changes in the released
+                * Specification of July 22, 2013” it says:
+                *
+                * “Relax BlitFramebuffer in section 18.3.1 so that format
+                *  conversion can take place during multisample blits, since
+                *  drivers already allow this and some apps depend on it.”
+                */
+               if (_mesa_is_gles(ctx) &&
+                   !compatible_resolve_formats(colorReadRb, colorDrawRb)) {
                   _mesa_error(ctx, GL_INVALID_OPERATION,
                          "%s(bad src/dst multisample pixel formats)", func);
                   return;
index e0639c8..26f873b 100644 (file)
@@ -127,6 +127,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
          return &ctx->DrawIndirectBuffer;
       }
       break;
+   case GL_PARAMETER_BUFFER_ARB:
+      if (_mesa_has_ARB_indirect_parameters(ctx)) {
+         return &ctx->ParameterBuffer;
+      }
+      break;
    case GL_DISPATCH_INDIRECT_BUFFER:
       if (_mesa_has_compute_shaders(ctx)) {
          return &ctx->DispatchIndirectBuffer;
@@ -447,7 +452,7 @@ _mesa_new_buffer_object(struct gl_context *ctx, GLuint name)
  *
  * Default callback for the \c dd_function_table::DeleteBuffer() hook.
  */
-static void
+void
 _mesa_delete_buffer_object(struct gl_context *ctx,
                            struct gl_buffer_object *bufObj)
 {
@@ -866,6 +871,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer,
                                 ctx->Shared->NullBufferObj);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer,
+                                ctx->Shared->NullBufferObj);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer,
                                 ctx->Shared->NullBufferObj);
 
@@ -913,6 +921,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ParameterBuffer, NULL);
+
    _mesa_reference_buffer_object(ctx, &ctx->DispatchIndirectBuffer, NULL);
 
    for (i = 0; i < MAX_COMBINED_UNIFORM_BUFFERS; i++) {
@@ -1205,9 +1215,10 @@ _mesa_BindBuffer(GLenum target, GLuint buffer)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "glBindBuffer(%s, %u)\n",
                   _mesa_enum_to_string(target), buffer);
+   }
 
    bind_buffer_object(ctx, target, buffer);
 }
@@ -1260,6 +1271,11 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
             _mesa_BindBuffer( GL_DRAW_INDIRECT_BUFFER, 0 );
          }
 
+         /* unbind ARB_indirect_parameters binding point */
+         if (ctx->ParameterBuffer == bufObj) {
+            _mesa_BindBuffer(GL_PARAMETER_BUFFER_ARB, 0);
+         }
+
          /* unbind ARB_compute_shader binding point */
          if (ctx->DispatchIndirectBuffer == bufObj) {
             _mesa_BindBuffer(GL_DISPATCH_INDIRECT_BUFFER, 0);
@@ -1562,12 +1578,13 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
 {
    bool valid_usage;
 
-   if (MESA_VERBOSE & VERBOSE_API)
+   if (MESA_VERBOSE & VERBOSE_API) {
       _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n",
                   func,
                   _mesa_enum_to_string(target),
                   (long int) size, data,
                   _mesa_enum_to_string(usage));
+   }
 
    if (size < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func);
@@ -3112,147 +3129,15 @@ unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
 }
 
 static void
-bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
-                          const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_uniform_buffers(ctx, first, count, "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_uniform_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_uniform_buffer_binding *binding =
-          &ctx->UniformBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ubo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
-                                 GLsizei count, const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_shader_storage_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_shader_storage_buffer_binding *binding =
-          &ctx->ShaderStorageBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj) {
-         if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
-         else
-            set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
-      }
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
-                           const GLuint *buffers,
-                           const GLintptr *offsets, const GLsizeiptr *sizes)
+bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
+                     const GLuint *buffers,
+                     bool range,
+                     const GLintptr *offsets, const GLsizeiptr *sizes,
+                     const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_uniform_buffers(ctx, first, count,
-                                         "glBindBuffersRange"))
+   if (!error_check_bind_uniform_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3297,52 +3182,57 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
       struct gl_uniform_buffer_binding *binding =
          &ctx->UniformBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Uniform buffer array bindings (see sec. 7.6)                  │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
-       *      │                     │  OFFSET_ALIGNMENT                       │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_UNIFORM_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.UniformBufferOffsetAlignment);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Uniform buffer array bindings (see sec. 7.6)                  │
+          *      ├─────────────────────┬─────────────────────────────────────────┤
+          *      │  ...                │  ...                                    │
+          *      │  offset restriction │  multiple of value of UNIFORM_BUFFER_-  │
+          *      │                     │  OFFSET_ALIGNMENT                       │
+          *      │  ...                │  ...                                    │
+          *      │  size restriction   │  none                                   │
+          *      └─────────────────────┴─────────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ctx->Const.UniformBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_UNIFORM_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.UniformBufferOffsetAlignment);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ubo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ubo_binding(ctx, binding, bufObj,
-                            offsets[i], sizes[i], GL_FALSE);
+            set_ubo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -3350,15 +3240,16 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
 }
 
 static void
-bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
-                                  GLsizei count, const GLuint *buffers,
-                                  const GLintptr *offsets,
-                                  const GLsizeiptr *sizes)
+bind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
+                            GLsizei count, const GLuint *buffers,
+                            bool range,
+                            const GLintptr *offsets,
+                            const GLsizeiptr *sizes,
+                            const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
-                                                "glBindBuffersRange"))
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3403,52 +3294,57 @@ bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
       struct gl_shader_storage_buffer_binding *binding =
          &ctx->ShaderStorageBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
+
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
+
+         /* The ARB_multi_bind spec says:
+         *
+         *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+         *      pair of values in <offsets> and <sizes> does not respectively
+         *      satisfy the constraints described for those parameters for the
+         *      specified target, as described in section 6.7.1 (per binding)."
+         *
+         * Section 6.7.1 refers to table 6.5, which says:
+         *
+         *     "┌───────────────────────────────────────────────────────────────┐
+         *      │ Shader storage buffer array bindings (see sec. 7.8)           │
+         *      ├─────────────────────┬─────────────────────────────────────────┤
+         *      │  ...                │  ...                                    │
+         *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
+         *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
+         *      │  ...                │  ...                                    │
+         *      │  size restriction   │  none                                   │
+         *      └─────────────────────┴─────────────────────────────────────────┘"
+         */
+         if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of the value of "
+                        "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
+                        "target=GL_SHADER_STORAGE_BUFFER)",
+                        i, (int64_t) offsets[i],
+                        ctx->Const.ShaderStorageBufferOffsetAlignment);
+            continue;
+         }
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
-
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Shader storage buffer array bindings (see sec. 7.8)           │
-       *      ├─────────────────────┬─────────────────────────────────────────┤
-       *      │  ...                │  ...                                    │
-       *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
-       *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
-       *      │  ...                │  ...                                    │
-       *      │  size restriction   │  none                                   │
-       *      └─────────────────────┴─────────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of the value of "
-                     "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
-                     "target=GL_SHADER_STORAGE_BUFFER)",
-                     i, (int64_t) offsets[i],
-                     ctx->Const.ShaderStorageBufferOffsetAlignment);
-         continue;
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj) {
          if (bufObj == ctx->Shared->NullBufferObj)
-            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, !range);
          else
-            set_ssbo_binding(ctx, binding, bufObj,
-                             offsets[i], sizes[i], GL_FALSE);
+            set_ssbo_binding(ctx, binding, bufObj, offset, size, !range);
       }
    }
 
@@ -3520,84 +3416,19 @@ unbind_xfb_buffers(struct gl_context *ctx,
 }
 
 static void
-bind_xfb_buffers_base(struct gl_context *ctx,
-                      GLuint first, GLsizei count,
-                      const GLuint *buffers)
-{
-   struct gl_transform_feedback_object *tfObj =
-      ctx->TransformFeedback.CurrentObject;
-   GLint i;
-
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersBase"))
-      return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewTransformFeedback;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_xfb_buffers(ctx, tfObj, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_buffer_object * const boundBufObj = tfObj->Buffers[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (boundBufObj && boundBufObj->Name == buffers[i])
-         bufObj = boundBufObj;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         _mesa_set_transform_feedback_binding(ctx, tfObj, first + i,
-                                              bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_xfb_buffers_range(struct gl_context *ctx,
-                       GLuint first, GLsizei count,
-                       const GLuint *buffers,
-                       const GLintptr *offsets,
-                       const GLsizeiptr *sizes)
+bind_xfb_buffers(struct gl_context *ctx,
+                 GLuint first, GLsizei count,
+                 const GLuint *buffers,
+                 bool range,
+                 const GLintptr *offsets,
+                 const GLsizeiptr *sizes,
+                 const char *caller)
 {
    struct gl_transform_feedback_object *tfObj =
        ctx->TransformFeedback.CurrentObject;
    GLint i;
 
-   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count,
-                                     "glBindBuffersRange"))
+   if (!error_check_bind_xfb_buffers(ctx, tfObj, first, count, caller))
       return;
 
    /* Assume that at least one binding will be changed */
@@ -3642,55 +3473,64 @@ bind_xfb_buffers_range(struct gl_context *ctx,
       const GLuint index = first + i;
       struct gl_buffer_object * const boundBufObj = tfObj->Buffers[index];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         offset = offsets[i];
+         size = sizes[i];
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Transform feedback array bindings (see sec. 13.2.2)           │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    multiple of 4                      │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) offsets[i]);
-         continue;
-      }
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      if (sizes[i] & 0x3) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(sizes[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of 4 when "
-                     "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
-                     i, (int64_t) sizes[i]);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Transform feedback array bindings (see sec. 13.2.2)           │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    multiple of 4                      │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) offsets[i]);
+            continue;
+         }
+
+         if (sizes[i] & 0x3) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(sizes[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of 4 when "
+                        "target=GL_TRANSFORM_FEEDBACK_BUFFER)",
+                        i, (int64_t) sizes[i]);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (boundBufObj && boundBufObj->Name == buffers[i])
          bufObj = boundBufObj;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
          _mesa_set_transform_feedback_binding(ctx, tfObj, index, bufObj,
-                                              offsets[i], sizes[i]);
+                                              offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -3740,82 +3580,18 @@ unbind_atomic_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
 }
 
 static void
-bind_atomic_buffers_base(struct gl_context *ctx,
-                         GLuint first,
-                         GLsizei count,
-                         const GLuint *buffers)
-{
-   GLint i;
-
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersBase"))
-     return;
-
-   /* Assume that at least one binding will be changed */
-   FLUSH_VERTICES(ctx, 0);
-   ctx->NewDriverState |= ctx->DriverFlags.NewAtomicBuffer;
-
-   if (!buffers) {
-      /* The ARB_multi_bind spec says:
-       *
-       *   "If <buffers> is NULL, all bindings from <first> through
-       *    <first>+<count>-1 are reset to their unbound (zero) state."
-       */
-      unbind_atomic_buffers(ctx, first, count);
-      return;
-   }
-
-   /* Note that the error semantics for multi-bind commands differ from
-    * those of other GL commands.
-    *
-    * The Issues section in the ARB_multi_bind spec says:
-    *
-    *    "(11) Typically, OpenGL specifies that if an error is generated by a
-    *          command, that command has no effect.  This is somewhat
-    *          unfortunate for multi-bind commands, because it would require a
-    *          first pass to scan the entire list of bound objects for errors
-    *          and then a second pass to actually perform the bindings.
-    *          Should we have different error semantics?
-    *
-    *       RESOLVED:  Yes.  In this specification, when the parameters for
-    *       one of the <count> binding points are invalid, that binding point
-    *       is not updated and an error will be generated.  However, other
-    *       binding points in the same command will be updated if their
-    *       parameters are valid and no other error occurs."
-    */
-
-   _mesa_begin_bufferobj_lookups(ctx);
-
-   for (i = 0; i < count; i++) {
-      struct gl_atomic_buffer_binding *binding =
-         &ctx->AtomicBufferBindings[first + i];
-      struct gl_buffer_object *bufObj;
-
-      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
-         bufObj = binding->BufferObject;
-      else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersBase");
-
-      if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, 0, 0);
-   }
-
-   _mesa_end_bufferobj_lookups(ctx);
-}
-
-static void
-bind_atomic_buffers_range(struct gl_context *ctx,
-                          GLuint first,
-                          GLsizei count,
-                          const GLuint *buffers,
-                          const GLintptr *offsets,
-                          const GLsizeiptr *sizes)
+bind_atomic_buffers(struct gl_context *ctx,
+                    GLuint first,
+                    GLsizei count,
+                    const GLuint *buffers,
+                    bool range,
+                    const GLintptr *offsets,
+                    const GLsizeiptr *sizes,
+                    const char *caller)
 {
    GLint i;
 
-   if (!error_check_bind_atomic_buffers(ctx, first, count,
-                                        "glBindBuffersRange"))
+   if (!error_check_bind_atomic_buffers(ctx, first, count, caller))
      return;
 
    /* Assume that at least one binding will be changed */
@@ -3860,45 +3636,51 @@ bind_atomic_buffers_range(struct gl_context *ctx,
       struct gl_atomic_buffer_binding *binding =
          &ctx->AtomicBufferBindings[first + i];
       struct gl_buffer_object *bufObj;
+      GLintptr offset = 0;
+      GLsizeiptr size = 0;
 
-      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
-         continue;
+      if (range) {
+         if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+            continue;
 
-      /* The ARB_multi_bind spec says:
-       *
-       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
-       *      pair of values in <offsets> and <sizes> does not respectively
-       *      satisfy the constraints described for those parameters for the
-       *      specified target, as described in section 6.7.1 (per binding)."
-       *
-       * Section 6.7.1 refers to table 6.5, which says:
-       *
-       *     "┌───────────────────────────────────────────────────────────────┐
-       *      │ Atomic counter array bindings (see sec. 7.7.2)                │
-       *      ├───────────────────────┬───────────────────────────────────────┤
-       *      │    ...                │    ...                                │
-       *      │    offset restriction │    multiple of 4                      │
-       *      │    ...                │    ...                                │
-       *      │    size restriction   │    none                               │
-       *      └───────────────────────┴───────────────────────────────────────┘"
-       */
-      if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
-         _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glBindBuffersRange(offsets[%u]=%" PRId64
-                     " is misaligned; it must be a multiple of %d when "
-                     "target=GL_ATOMIC_COUNTER_BUFFER)",
-                     i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
-         continue;
+         /* The ARB_multi_bind spec says:
+          *
+          *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+          *      pair of values in <offsets> and <sizes> does not respectively
+          *      satisfy the constraints described for those parameters for the
+          *      specified target, as described in section 6.7.1 (per binding)."
+          *
+          * Section 6.7.1 refers to table 6.5, which says:
+          *
+          *     "┌───────────────────────────────────────────────────────────────┐
+          *      │ Atomic counter array bindings (see sec. 7.7.2)                │
+          *      ├───────────────────────┬───────────────────────────────────────┤
+          *      │    ...                │    ...                                │
+          *      │    offset restriction │    multiple of 4                      │
+          *      │    ...                │    ...                                │
+          *      │    size restriction   │    none                               │
+          *      └───────────────────────┴───────────────────────────────────────┘"
+          */
+         if (offsets[i] & (ATOMIC_COUNTER_SIZE - 1)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "glBindBuffersRange(offsets[%u]=%" PRId64
+                        " is misaligned; it must be a multiple of %d when "
+                        "target=GL_ATOMIC_COUNTER_BUFFER)",
+                        i, (int64_t) offsets[i], ATOMIC_COUNTER_SIZE);
+            continue;
+         }
+
+         offset = offsets[i];
+         size = sizes[i];
       }
 
       if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
          bufObj = binding->BufferObject;
       else
-         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
-                                                    "glBindBuffersRange");
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i, caller);
 
       if (bufObj)
-         set_atomic_buffer_binding(ctx, binding, bufObj, offsets[i], sizes[i]);
+         set_atomic_buffer_binding(ctx, binding, bufObj, offset, size);
    }
 
    _mesa_end_bufferobj_lookups(ctx);
@@ -3911,6 +3693,11 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferRange(%s, %u, %u, %ld, %ld)\n",
+                  _mesa_enum_to_string(target), index, buffer, offset, size);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -3963,6 +3750,11 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBufferBase(%s, %u, %u)\n",
+                  _mesa_enum_to_string(target), index, buffer);
+   }
+
    if (buffer == 0) {
       bufObj = ctx->Shared->NullBufferObj;
    } else {
@@ -4033,20 +3825,28 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersRange(%s, %u, %d, %p, %p, %p)\n",
+                  _mesa_enum_to_string(target), first, count,
+                  buffers, offsets, sizes);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_xfb_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                       "glBindBuffersRange");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
+      bind_uniform_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                           "glBindBuffersRange");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
-                                        sizes);
+      bind_shader_storage_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                                  "glBindBuffersRange");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_range(ctx, first, count, buffers,
-                                offsets, sizes);
+      bind_atomic_buffers(ctx, first, count, buffers, true, offsets, sizes,
+                          "glBindBuffersRange");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)",
@@ -4061,18 +3861,27 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glBindBuffersBase(%s, %u, %d, %p)\n",
+                  _mesa_enum_to_string(target), first, count, buffers);
+   }
+
    switch (target) {
    case GL_TRANSFORM_FEEDBACK_BUFFER:
-      bind_xfb_buffers_base(ctx, first, count, buffers);
+      bind_xfb_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                       "glBindBuffersBase");
       return;
    case GL_UNIFORM_BUFFER:
-      bind_uniform_buffers_base(ctx, first, count, buffers);
+      bind_uniform_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                           "glBindBuffersBase");
       return;
    case GL_SHADER_STORAGE_BUFFER:
-      bind_shader_storage_buffers_base(ctx, first, count, buffers);
+      bind_shader_storage_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                                  "glBindBuffersBase");
       return;
    case GL_ATOMIC_COUNTER_BUFFER:
-      bind_atomic_buffers_base(ctx, first, count, buffers);
+      bind_atomic_buffers(ctx, first, count, buffers, false, NULL, NULL,
+                          "glBindBuffersBase");
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)",
@@ -4089,8 +3898,14 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
    struct gl_buffer_object *bufObj;
    const GLintptr end = offset + length;
 
+   /* Section 6.5 (Invalidating Buffer Data) of the OpenGL 4.5 (Compatibility
+    * Profile) spec says:
+    *
+    *     "An INVALID_VALUE error is generated if buffer is zero or is not the
+    *     name of an existing buffer object."
+    */
    bufObj = _mesa_lookup_bufferobj(ctx, buffer);
-   if (!bufObj) {
+   if (!bufObj || bufObj == &DummyBufferObject) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glInvalidateBufferSubData(name = 0x%x) invalid object",
                   buffer);
@@ -4103,7 +3918,7 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
     *     negative, or if <offset> + <length> is greater than the value of
     *     BUFFER_SIZE."
     */
-   if (end < 0 || end > bufObj->Size) {
+   if (offset < 0 || length < 0 || end > bufObj->Size) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glInvalidateBufferSubData(invalid offset or length)");
       return;
@@ -4124,10 +3939,8 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
       return;
    }
 
-   /* We don't actually do anything for this yet.  Just return after
-    * validating the parameters and generating the required errors.
-    */
-   return;
+   if (ctx->Driver.InvalidateBufferSubData)
+      ctx->Driver.InvalidateBufferSubData(ctx, bufObj, offset, length);
 }
 
 void GLAPIENTRY
@@ -4136,8 +3949,14 @@ _mesa_InvalidateBufferData(GLuint buffer)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_buffer_object *bufObj;
 
+   /* Section 6.5 (Invalidating Buffer Data) of the OpenGL 4.5 (Compatibility
+    * Profile) spec says:
+    *
+    *     "An INVALID_VALUE error is generated if buffer is zero or is not the
+    *     name of an existing buffer object."
+    */
    bufObj = _mesa_lookup_bufferobj(ctx, buffer);
-   if (!bufObj) {
+   if (!bufObj || bufObj == &DummyBufferObject) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glInvalidateBufferData(name = 0x%x) invalid object",
                   buffer);
@@ -4158,8 +3977,6 @@ _mesa_InvalidateBufferData(GLuint buffer)
       return;
    }
 
-   /* We don't actually do anything for this yet.  Just return after
-    * validating the parameters and generating the required errors.
-    */
-   return;
+   if (ctx->Driver.InvalidateBufferSubData)
+      ctx->Driver.InvalidateBufferSubData(ctx, bufObj, 0, bufObj->Size);
 }
index 3eac96d..a5bfe88 100644 (file)
@@ -109,6 +109,10 @@ _mesa_initialize_buffer_object(struct gl_context *ctx,
                                GLuint name);
 
 extern void
+_mesa_delete_buffer_object(struct gl_context *ctx,
+                           struct gl_buffer_object *bufObj);
+
+extern void
 _mesa_reference_buffer_object_(struct gl_context *ctx,
                                struct gl_buffer_object **ptr,
                                struct gl_buffer_object *bufObj);
index e5281ce..70ed563 100644 (file)
@@ -634,6 +634,11 @@ struct dd_function_table {
                               GLintptr readOffset, GLintptr writeOffset,
                               GLsizeiptr size );
 
+   void (*InvalidateBufferSubData)( struct gl_context *ctx,
+                                    struct gl_buffer_object *obj,
+                                    GLintptr offset,
+                                    GLsizeiptr length );
+
    /* Returns pointer to the start of the mapped range.
     * May return NULL if MESA_MAP_NOWAIT_BIT is set in access:
     */
index 52a4ed6..aeccb01 100644 (file)
@@ -70,6 +70,7 @@ EXT(ARB_gpu_shader5                         , ARB_gpu_shader5
 EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  x , GLC,  x ,  x , 2010)
 EXT(ARB_half_float_pixel                    , dummy_true                             , GLL, GLC,  x ,  x , 2003)
 EXT(ARB_half_float_vertex                   , ARB_half_float_vertex                  , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_indirect_parameters                 , ARB_indirect_parameters                ,  x , GLC,  x ,  x , 2013)
 EXT(ARB_instanced_arrays                    , ARB_instanced_arrays                   , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_internalformat_query                , ARB_internalformat_query               , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_invalidate_subdata                  , dummy_true                             , GLL, GLC,  x ,  x , 2012)
@@ -96,6 +97,7 @@ EXT(ARB_separate_shader_objects             , dummy_true
 EXT(ARB_shader_atomic_counters              , ARB_shader_atomic_counters             , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_shader_bit_encoding                 , ARB_shader_bit_encoding                , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_shader_clock                        , ARB_shader_clock                       , GLL, GLC,  x ,  x , 2015)
+EXT(ARB_shader_draw_parameters              , ARB_shader_draw_parameters             , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_shader_image_load_store             , ARB_shader_image_load_store            , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_shader_image_size                   , ARB_shader_image_size                  , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_shader_objects                      , dummy_true                             , GLL, GLC,  x ,  x , 2002)
index fe6bdc2..3be216d 100644 (file)
@@ -1253,23 +1253,22 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
       ctx->Driver.ValidateFramebuffer(ctx, fb);
       if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
          fbo_incomplete(ctx, "driver marked FBO as incomplete", -1);
+         return;
       }
    }
 
-   if (fb->_Status == GL_FRAMEBUFFER_COMPLETE_EXT) {
-      /*
-       * Note that if ARB_framebuffer_object is supported and the attached
-       * renderbuffers/textures are different sizes, the framebuffer
-       * width/height will be set to the smallest width/height.
-       */
-      if (numImages != 0) {
-         fb->Width = minWidth;
-         fb->Height = minHeight;
-      }
-
-      /* finally, update the visual info for the framebuffer */
-      _mesa_update_framebuffer_visual(ctx, fb);
+   /*
+    * Note that if ARB_framebuffer_object is supported and the attached
+    * renderbuffers/textures are different sizes, the framebuffer
+    * width/height will be set to the smallest width/height.
+    */
+   if (numImages != 0) {
+      fb->Width = minWidth;
+      fb->Height = minHeight;
    }
+
+   /* finally, update the visual info for the framebuffer */
+   _mesa_update_framebuffer_visual(ctx, fb);
 }
 
 
index c6a2e5b..95cb18c 100644 (file)
@@ -423,6 +423,7 @@ EXTRA_EXT(ARB_framebuffer_no_attachments);
 EXTRA_EXT(ARB_tessellation_shader);
 EXTRA_EXT(ARB_shader_subroutine);
 EXTRA_EXT(ARB_shader_storage_buffer_object);
+EXTRA_EXT(ARB_indirect_parameters);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -1032,6 +1033,10 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
    case GL_DRAW_INDIRECT_BUFFER_BINDING:
       v->value_int = ctx->DrawIndirectBuffer->Name;
       break;
+   /* GL_ARB_indirect_parameters */
+   case GL_PARAMETER_BUFFER_BINDING_ARB:
+      v->value_int = ctx->ParameterBuffer->Name;
+      break;
    /* GL_ARB_separate_shader_objects */
    case GL_PROGRAM_PIPELINE_BINDING:
       if (ctx->Pipeline.Current) {
index 7a48ed2..af7a8f4 100644 (file)
@@ -887,6 +887,10 @@ descriptor=[
 # GL_ARB_shader_subroutine
   [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
   [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
+
+# GL_ARB_indirect_parameters
+  [ "PARAMETER_BUFFER_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_indirect_parameters" ],
+
 ]}
 
 ]
index 042147f..ad7af5c 100644 (file)
@@ -151,6 +151,13 @@ static inline int IROUND(float f)
    return (int) ((f >= 0.0F) ? (f + 0.5F) : (f - 0.5F));
 }
 
+/**
+ * Convert double to int by rounding to nearest integer, away from zero.
+ */
+static inline int IROUNDD(double d)
+{
+   return (int) ((d >= 0.0) ? (d + 0.5) : (d - 0.5));
+}
 
 /**
  * Convert float to int64 by rounding to nearest integer.
index 48309bf..1c717fe 100644 (file)
@@ -2155,8 +2155,6 @@ struct gl_compute_program_state
 /**
  * ATI_fragment_shader runtime state
  */
-#define ATI_FS_INPUT_PRIMARY 0
-#define ATI_FS_INPUT_SECONDARY 1
 
 struct atifs_instruction;
 struct atifs_setupinst;
@@ -2487,6 +2485,11 @@ struct gl_uniform_block
    GLuint Binding;
 
    /**
+    * Vulkan descriptor set qualifier for this block.
+    */
+   GLuint Set;
+
+   /**
     * Minimum size (in bytes) of a buffer object to back this uniform buffer
     * (GL_UNIFORM_BLOCK_DATA_SIZE).
     */
@@ -2527,6 +2530,67 @@ struct gl_active_atomic_buffer
 };
 
 /**
+ * Data container for shader queries. This holds only the minimal
+ * amount of required information for resource queries to work.
+ */
+struct gl_shader_variable
+{
+   /**
+    * Declared type of the variable
+    */
+   const struct glsl_type *type;
+
+   /**
+    * Declared name of the variable
+    */
+   char *name;
+
+   /**
+    * Storage location of the base of this variable
+    *
+    * The precise meaning of this field depends on the nature of the variable.
+    *
+    *   - Vertex shader input: one of the values from \c gl_vert_attrib.
+    *   - Vertex shader output: one of the values from \c gl_varying_slot.
+    *   - Geometry shader input: one of the values from \c gl_varying_slot.
+    *   - Geometry shader output: one of the values from \c gl_varying_slot.
+    *   - Fragment shader input: one of the values from \c gl_varying_slot.
+    *   - Fragment shader output: one of the values from \c gl_frag_result.
+    *   - Uniforms: Per-stage uniform slot number for default uniform block.
+    *   - Uniforms: Index within the uniform block definition for UBO members.
+    *   - Non-UBO Uniforms: explicit location until linking then reused to
+    *     store uniform slot number.
+    *   - Other: This field is not currently used.
+    *
+    * If the variable is a uniform, shader input, or shader output, and the
+    * slot has not been assigned, the value will be -1.
+    */
+   int location;
+
+   /**
+    * Output index for dual source blending.
+    *
+    * \note
+    * The GLSL spec only allows the values 0 or 1 for the index in \b dual
+    * source blending.
+    */
+   unsigned index:1;
+
+   /**
+    * Specifies whether a shader input/output is per-patch in tessellation
+    * shader stages.
+    */
+   unsigned patch:1;
+
+   /**
+    * Storage class of the variable.
+    *
+    * \sa (n)ir_variable_mode
+    */
+   unsigned mode:4;
+};
+
+/**
  * Active resource in a gl_shader_program
  */
 struct gl_program_resource
@@ -2738,6 +2802,13 @@ struct gl_shader_program
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
    /**
+    * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
+    * Objects and Shader Storage Buffer Objects.
+    */
+   unsigned *UboInterfaceBlockIndex;
+   unsigned *SsboInterfaceBlockIndex;
+
+   /**
     * Map of active uniform names to locations
     *
     * Maps any active uniform that is not an array element to a location.
@@ -3517,6 +3588,10 @@ struct gl_constants
     */
    GLboolean GLSLSkipStrictMaxUniformLimitCheck;
 
+   /** Whether gl_FragCoord and gl_FrontFacing are system values. */
+   bool GLSLFragCoordIsSysVal;
+   bool GLSLFrontFacingIsSysVal;
+
    /**
     * Always use the GetTransformFeedbackVertexCount() driver hook, rather
     * than passing the transform feedback object to the drawing function.
@@ -3702,6 +3777,7 @@ struct gl_extensions
    GLboolean ARB_gpu_shader5;
    GLboolean ARB_gpu_shader_fp64;
    GLboolean ARB_half_float_vertex;
+   GLboolean ARB_indirect_parameters;
    GLboolean ARB_instanced_arrays;
    GLboolean ARB_internalformat_query;
    GLboolean ARB_map_buffer_range;
@@ -3714,6 +3790,7 @@ struct gl_extensions
    GLboolean ARB_shader_atomic_counters;
    GLboolean ARB_shader_bit_encoding;
    GLboolean ARB_shader_clock;
+   GLboolean ARB_shader_draw_parameters;
    GLboolean ARB_shader_image_load_store;
    GLboolean ARB_shader_image_size;
    GLboolean ARB_shader_precision;
@@ -4349,6 +4426,7 @@ struct gl_context
    struct gl_perf_monitor_state PerfMonitor;
 
    struct gl_buffer_object *DrawIndirectBuffer; /** < GL_ARB_draw_indirect */
+   struct gl_buffer_object *ParameterBuffer; /** < GL_ARB_indirect_parameters */
    struct gl_buffer_object *DispatchIndirectBuffer; /** < GL_ARB_compute_shader */
 
    struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */
@@ -4537,11 +4615,22 @@ enum _debug
    DEBUG_INCOMPLETE_FBO         = (1 << 3)
 };
 
+/**
+ * Checks if the active fragment shader program can have side effects due
+ * to use of things like atomic buffers or images
+ */
 static inline bool
-_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx)
+_mesa_active_fragment_shader_has_side_effects(const struct gl_context *ctx)
 {
-   return ctx->Shader._CurrentFragmentProgram != NULL &&
-      ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]->NumAtomicBuffers > 0;
+   const struct gl_shader *sh;
+
+   if (!ctx->_Shader->_CurrentFragmentProgram)
+      return false;
+
+   sh = ctx->_Shader->_CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT];
+   return sh->NumAtomicBuffers > 0 ||
+          sh->NumImages > 0 ||
+          sh->NumShaderStorageBlocks > 0;
 }
 
 #ifdef __cplusplus
index 98dfbea..43529b2 100644 (file)
@@ -591,11 +591,10 @@ perf_monitor_result_size(const struct gl_context *ctx,
 
    for (group = 0; group < ctx->PerfMonitor.NumGroups; group++) {
       const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[group];
-      for (counter = 0; counter < g->NumCounters; counter++) {
-         const struct gl_perf_monitor_counter *c = &g->Counters[counter];
+      BITSET_WORD tmp;
 
-         if (!BITSET_TEST(m->ActiveCounters[group], counter))
-            continue;
+      BITSET_FOREACH_SET(counter, tmp, m->ActiveCounters[group], g->NumCounters) {
+         const struct gl_perf_monitor_counter *c = &g->Counters[counter];
 
          size += sizeof(uint32_t); /* Group ID */
          size += sizeof(uint32_t); /* Counter ID */
index b7e25fe..9a15cfe 100644 (file)
@@ -70,6 +70,13 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
                             GLenum pname, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramInterfaceiv(%u, %s, %s, %p)\n",
+                  program, _mesa_enum_to_string(programInterface),
+                  _mesa_enum_to_string(pname), params);
+   }
+
    unsigned i;
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
@@ -226,6 +233,12 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    unsigned array_index = 0;
    struct gl_program_resource *res;
    struct gl_shader_program *shProg =
@@ -290,6 +303,13 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
                              GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceName(%u, %s, %u, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  bufSize, length, name);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
                                       "glGetProgramResourceName");
@@ -315,6 +335,13 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface,
                            GLsizei *length, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceiv(%u, %s, %u, %d, %p, %d, %p, %p)\n",
+                  program, _mesa_enum_to_string(programInterface), index,
+                  propCount, props, bufSize, length, params);
+   }
+
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program, "glGetProgramResourceiv");
 
@@ -361,6 +388,12 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
                                  const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocation(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocation");
 
@@ -411,6 +444,12 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
                                       const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API) {
+      _mesa_debug(ctx, "glGetProgramResourceLocationIndex(%u, %s, %s)\n",
+                  program, _mesa_enum_to_string(programInterface), name);
+   }
+
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocationIndex");
 
index 676dd36..fe15508 100644 (file)
@@ -270,6 +270,17 @@ _mesa_IsSampler(GLuint sampler)
    return sampObj != NULL;
 }
 
+void
+_mesa_bind_sampler(struct gl_context *ctx, GLuint unit,
+                   struct gl_sampler_object *sampObj)
+{
+   if (ctx->Texture.Unit[unit].Sampler != sampObj) {
+      FLUSH_VERTICES(ctx, _NEW_TEXTURE);
+   }
+
+   _mesa_reference_sampler_object(ctx, &ctx->Texture.Unit[unit].Sampler,
+                                  sampObj);
+}
 
 void GLAPIENTRY
 _mesa_BindSampler(GLuint unit, GLuint sampler)
@@ -297,13 +308,8 @@ _mesa_BindSampler(GLuint unit, GLuint sampler)
       }
    }
    
-   if (ctx->Texture.Unit[unit].Sampler != sampObj) {
-      FLUSH_VERTICES(ctx, _NEW_TEXTURE);
-   }
-
    /* bind new sampler */
-   _mesa_reference_sampler_object(ctx, &ctx->Texture.Unit[unit].Sampler,
-                                  sampObj);
+   _mesa_bind_sampler(ctx, unit, sampObj);
 }
 
 
@@ -444,6 +450,22 @@ flush(struct gl_context *ctx)
    FLUSH_VERTICES(ctx, _NEW_TEXTURE);
 }
 
+void
+_mesa_set_sampler_wrap(struct gl_context *ctx, struct gl_sampler_object *samp,
+                       GLenum s, GLenum t, GLenum r)
+{
+   assert(validate_texture_wrap_mode(ctx, s));
+   assert(validate_texture_wrap_mode(ctx, t));
+   assert(validate_texture_wrap_mode(ctx, r));
+
+   if (samp->WrapS == s && samp->WrapT == t && samp->WrapR == r)
+      return;
+
+   flush(ctx);
+   samp->WrapS = s;
+   samp->WrapT = t;
+   samp->WrapR = r;
+}
 
 #define INVALID_PARAM 0x100
 #define INVALID_PNAME 0x101
@@ -493,6 +515,27 @@ set_sampler_wrap_r(struct gl_context *ctx, struct gl_sampler_object *samp,
    return INVALID_PARAM;
 }
 
+void
+_mesa_set_sampler_filters(struct gl_context *ctx,
+                          struct gl_sampler_object *samp,
+                          GLenum min_filter, GLenum mag_filter)
+{
+   assert(min_filter == GL_NEAREST ||
+          min_filter == GL_LINEAR ||
+          min_filter == GL_NEAREST_MIPMAP_NEAREST ||
+          min_filter == GL_LINEAR_MIPMAP_NEAREST ||
+          min_filter == GL_NEAREST_MIPMAP_LINEAR ||
+          min_filter == GL_LINEAR_MIPMAP_LINEAR);
+   assert(mag_filter == GL_NEAREST ||
+          mag_filter == GL_LINEAR);
+
+   if (samp->MinFilter == min_filter && samp->MagFilter == mag_filter)
+      return;
+
+   flush(ctx);
+   samp->MinFilter = min_filter;
+   samp->MagFilter = mag_filter;
+}
 
 static GLuint
 set_sampler_min_filter(struct gl_context *ctx, struct gl_sampler_object *samp,
@@ -713,6 +756,16 @@ set_sampler_cube_map_seamless(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+void
+_mesa_set_sampler_srgb_decode(struct gl_context *ctx,
+                              struct gl_sampler_object *samp, GLenum param)
+{
+   assert(param == GL_DECODE_EXT || param == GL_SKIP_DECODE_EXT);
+
+   flush(ctx);
+   samp->sRGBDecode = param;
+}
+
 static GLuint
 set_sampler_srgb_decode(struct gl_context *ctx,
                               struct gl_sampler_object *samp, GLenum param)
index 7bea911..abc6e01 100644 (file)
@@ -80,6 +80,23 @@ _mesa_new_sampler_object(struct gl_context *ctx, GLuint name);
 extern void
 _mesa_init_sampler_object_functions(struct dd_function_table *driver);
 
+extern void
+_mesa_set_sampler_wrap(struct gl_context *ctx, struct gl_sampler_object *samp,
+                       GLenum s, GLenum t, GLenum r);
+
+extern void
+_mesa_set_sampler_filters(struct gl_context *ctx,
+                          struct gl_sampler_object *samp,
+                          GLenum min_filter, GLenum mag_filter);
+
+extern void
+_mesa_set_sampler_srgb_decode(struct gl_context *ctx,
+                              struct gl_sampler_object *samp, GLenum param);
+
+extern void
+_mesa_bind_sampler(struct gl_context *ctx, GLuint unit,
+                   struct gl_sampler_object *sampObj);
+
 void GLAPIENTRY
 _mesa_GenSamplers(GLsizei count, GLuint *samplers);
 void GLAPIENTRY
index ced10a9..a18b860 100644 (file)
@@ -56,7 +56,7 @@ const type * RESOURCE_ ## name (gl_program_resource *res) { \
    return (type *) res->Data; \
 }
 
-DECL_RESOURCE_FUNC(VAR, ir_variable);
+DECL_RESOURCE_FUNC(VAR, gl_shader_variable);
 DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
@@ -101,14 +101,14 @@ _mesa_BindAttribLocation(GLhandleARB program, GLuint index,
 }
 
 static bool
-is_active_attrib(const ir_variable *var)
+is_active_attrib(const gl_shader_variable *var)
 {
    if (!var)
       return false;
 
-   switch (var->data.mode) {
+   switch (var->mode) {
    case ir_var_shader_in:
-      return var->data.location != -1;
+      return var->location != -1;
 
    case ir_var_system_value:
       /* From GL 4.3 core spec, section 11.1.1 (Vertex Attributes):
@@ -116,9 +116,9 @@ is_active_attrib(const ir_variable *var)
        * are enumerated, including the special built-in inputs gl_VertexID
        * and gl_InstanceID."
        */
-      return var->data.location == SYSTEM_VALUE_VERTEX_ID ||
-             var->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE ||
-             var->data.location == SYSTEM_VALUE_INSTANCE_ID;
+      return var->location == SYSTEM_VALUE_VERTEX_ID ||
+             var->location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE ||
+             var->location == SYSTEM_VALUE_INSTANCE_ID;
 
    default:
       return false;
@@ -163,7 +163,7 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
       return;
    }
 
-   const ir_variable *const var = RESOURCE_VAR(res);
+   const gl_shader_variable *const var = RESOURCE_VAR(res);
 
    if (!is_active_attrib(var))
       return;
@@ -174,8 +174,8 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
     * consider gl_VertexIDMESA as gl_VertexID for purposes of checking
     * active attributes.
     */
-   if (var->data.mode == ir_var_system_value &&
-       var->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
+   if (var->mode == ir_var_system_value &&
+       var->location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
       var_name = "gl_VertexID";
    }
 
@@ -427,7 +427,7 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
 const char*
 _mesa_program_resource_name(struct gl_program_resource *res)
 {
-   const ir_variable *var;
+   const gl_shader_variable *var;
    switch (res->Type) {
    case GL_UNIFORM_BLOCK:
    case GL_SHADER_STORAGE_BLOCK:
@@ -437,8 +437,8 @@ _mesa_program_resource_name(struct gl_program_resource *res)
    case GL_PROGRAM_INPUT:
       var = RESOURCE_VAR(res);
       /* Special case gl_VertexIDMESA -> gl_VertexID. */
-      if (var->data.mode == ir_var_system_value &&
-          var->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
+      if (var->mode == ir_var_system_value &&
+          var->location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
          return "gl_VertexID";
       }
    /* fallthrough */
@@ -764,8 +764,12 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
 static bool
 add_index_to_name(struct gl_program_resource *res)
 {
-   bool add_index = !(((res->Type == GL_PROGRAM_INPUT) &&
-                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY)));
+   bool add_index = !((res->Type == GL_PROGRAM_INPUT &&
+                       res->StageReferences & (1 << MESA_SHADER_GEOMETRY |
+                                               1 << MESA_SHADER_TESS_CTRL |
+                                               1 << MESA_SHADER_TESS_EVAL)) ||
+                      (res->Type == GL_PROGRAM_OUTPUT &&
+                       res->StageReferences & 1 << MESA_SHADER_TESS_CTRL));
 
    /* Transform feedback varyings have array index already appended
     * in their names.
@@ -853,14 +857,14 @@ program_resource_location(struct gl_shader_program *shProg,
     */
    switch (res->Type) {
    case GL_PROGRAM_INPUT: {
-      const ir_variable *var = RESOURCE_VAR(res);
+      const gl_shader_variable *var = RESOURCE_VAR(res);
 
       /* If the input is an array, fail if the index is out of bounds. */
       if (array_index > 0
           && array_index >= var->type->length) {
          return -1;
       }
-      return (var->data.location +
+      return (var->location +
              (array_index * var->type->without_array()->matrix_columns) -
              VERT_ATTRIB_GENERIC0);
    }
@@ -870,7 +874,7 @@ program_resource_location(struct gl_shader_program *shProg,
           && array_index >= RESOURCE_VAR(res)->type->length) {
          return -1;
       }
-      return RESOURCE_VAR(res)->data.location + array_index - FRAG_RESULT_DATA0;
+      return RESOURCE_VAR(res)->location + array_index - FRAG_RESULT_DATA0;
    case GL_UNIFORM:
       /* If the uniform is built-in, fail. */
       if (RESOURCE_UNI(res)->builtin)
@@ -950,7 +954,7 @@ _mesa_program_resource_location_index(struct gl_shader_program *shProg,
    if (!res || !(res->StageReferences & (1 << MESA_SHADER_FRAGMENT)))
       return -1;
 
-   return RESOURCE_VAR(res)->data.index;
+   return RESOURCE_VAR(res)->index;
 }
 
 static uint8_t
@@ -1248,7 +1252,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
    case GL_LOCATION_INDEX:
       if (res->Type != GL_PROGRAM_OUTPUT)
          goto invalid_operation;
-      *val = RESOURCE_VAR(res)->data.index;
+      *val = RESOURCE_VAR(res)->index;
       return 1;
 
    case GL_NUM_COMPATIBLE_SUBROUTINES:
@@ -1305,7 +1309,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       switch (res->Type) {
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
-         *val = RESOURCE_VAR(res)->data.patch;
+         *val = RESOURCE_VAR(res)->patch;
          return 1;
       default:
          goto invalid_operation;
@@ -1373,46 +1377,107 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
 }
 
 static bool
-validate_io(const struct gl_shader *input_stage,
-            const struct gl_shader *output_stage, bool isES)
+validate_io(const struct gl_shader *producer,
+            const struct gl_shader *consumer, bool isES)
 {
-   assert(input_stage && output_stage);
+   assert(producer && consumer);
+   unsigned inputs = 0, outputs = 0;
+
+   /* From OpenGL ES 3.1 spec (Interface matching):
+    *
+    *    "An output variable is considered to match an input variable in the
+    *    subsequent shader if:
+    *
+    *    - the two variables match in name, type, and qualification; or
+    *    - the two variables are declared with the same location qualifier and
+    *      match in type and qualification.
+    *
+    *    ...
+    *
+    *    At an interface between program objects, the set of inputs and outputs
+    *    are considered to match exactly if and only if:
+    *
+    *    - Every declared input variable has a matching output, as described
+    *    above.
+    *
+    *    - There are no user-defined output variables declared without a
+    *    matching input variable declaration.
+    *
+    *    - All matched input and output variables have identical precision
+    *    qualification.
+    *
+    *    When the set of inputs and outputs on an interface between programs
+    *    matches exactly, all inputs are well-defined except when the
+    *    corresponding outputs were not written in the previous shader. However,
+    *    any mismatch between inputs and outputs will result in a validation
+    *    failure."
+    *
+    * OpenGL Core 4.5 spec includes same paragraph as above but without check
+    * for precision and the last 'validation failure' clause. Therefore
+    * behaviour is more relaxed, input and output amount is not required by the
+    * spec to be validated.
+    *
+    * FIXME: Update once Khronos spec bug #15331 is resolved.
+    * FIXME: Add validation by type, currently information loss during varying
+    * packing makes this challenging.
+    */
+
+   /* Currently no matching done for desktop. */
+   if (!isES)
+      return true;
 
    /* For each output in a, find input in b and do any required checks. */
-   foreach_in_list(ir_instruction, out, input_stage->ir) {
+   foreach_in_list(ir_instruction, out, producer->ir) {
       ir_variable *out_var = out->as_variable();
-      if (!out_var || out_var->data.mode != ir_var_shader_out)
+      if (!out_var || out_var->data.mode != ir_var_shader_out ||
+          is_gl_identifier(out_var->name))
          continue;
 
-      foreach_in_list(ir_instruction, in, output_stage->ir) {
+      outputs++;
+
+      inputs = 0;
+      foreach_in_list(ir_instruction, in, consumer->ir) {
          ir_variable *in_var = in->as_variable();
-         if (!in_var || in_var->data.mode != ir_var_shader_in)
+         if (!in_var || in_var->data.mode != ir_var_shader_in ||
+             is_gl_identifier(in_var->name))
+            continue;
+
+         inputs++;
+
+         /* Match by location qualifier and precision.
+          *
+          * FIXME: Add explicit location matching validation here. Be careful
+          * not to match varyings with explicit locations to varyings without
+          * explicit locations.
+          */
+         if ((in_var->data.explicit_location &&
+             out_var->data.explicit_location) &&
+             in_var->data.location == out_var->data.location &&
+             in_var->data.precision == out_var->data.precision)
             continue;
 
-         if (strcmp(in_var->name, out_var->name) == 0) {
-            /* Since we now only validate precision, we can skip this step for
-             * desktop GLSL shaders, there precision qualifier is ignored.
-             *
-             * From OpenGL 4.50 Shading Language spec, section 4.7:
-             *     "For the purposes of determining if an output from one
-             *     shader stage matches an input of the next stage, the
-             *     precision qualifier need not match."
+         unsigned len = strlen(in_var->name);
+
+         /* Handle input swizzle in variable name. */
+         const char *dot = strchr(in_var->name, '.');
+         if (dot)
+            len = dot - in_var->name;
+
+         /* Match by name and precision. */
+         if (strncmp(in_var->name, out_var->name, len) == 0) {
+            /* From OpenGL ES 3.1 spec:
+             *     "When both shaders are in separate programs, mismatched
+             *     precision qualifiers will result in a program interface
+             *     mismatch that will result in program pipeline validation
+             *     failures, as described in section 7.4.1 (“Shader Interface
+             *     Matching”) of the OpenGL ES 3.1 Specification."
              */
-            if (isES) {
-               /* From OpenGL ES 3.1 spec:
-                *     "When both shaders are in separate programs, mismatched
-                *     precision qualifiers will result in a program interface
-                *     mismatch that will result in program pipeline validation
-                *     failures, as described in section 7.4.1 (“Shader Interface
-                *     Matching”) of the OpenGL ES 3.1 Specification."
-                */
-               if (in_var->data.precision != out_var->data.precision)
-                  return false;
-            }
+            if (in_var->data.precision != out_var->data.precision)
+               return false;
          }
       }
    }
-   return true;
+   return inputs == outputs;
 }
 
 /**
@@ -1435,6 +1500,13 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline)
 
    for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
       if (shProg[idx]) {
+         /* Pipeline might include both non-compute and a compute program, do
+          * not attempt to validate varyings between non-compute and compute
+          * stage.
+          */
+         if (shProg[idx]->_LinkedShaders[idx]->Stage == MESA_SHADER_COMPUTE)
+            break;
+
          if (!validate_io(shProg[prev]->_LinkedShaders[prev],
                           shProg[idx]->_LinkedShaders[idx],
                           shProg[prev]->IsES || shProg[idx]->IsES))
index ac40891..cdc85f3 100644 (file)
@@ -208,7 +208,7 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type)
    case GL_TESS_EVALUATION_SHADER:
       return ctx == NULL || _mesa_has_tessellation(ctx);
    case GL_COMPUTE_SHADER:
-      return ctx == NULL || ctx->Extensions.ARB_compute_shader;
+      return ctx == NULL || _mesa_has_compute_shaders(ctx);
    default:
       return false;
    }
@@ -300,7 +300,8 @@ create_shader(struct gl_context *ctx, GLenum type)
    GLuint name;
 
    if (!_mesa_validate_shader_target(ctx, type)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "CreateShader(type)");
+      _mesa_error(ctx, GL_INVALID_ENUM, "CreateShader(%s)",
+                  _mesa_enum_to_string(type));
       return 0;
    }
 
@@ -1514,6 +1515,8 @@ void GLAPIENTRY
 _mesa_LinkProgram(GLhandleARB programObj)
 {
    GET_CURRENT_CONTEXT(ctx);
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glLinkProgram %u\n", programObj);
    link_program(ctx, programObj);
 }
 
@@ -1731,6 +1734,9 @@ _mesa_UseProgram(GLhandleARB program)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg;
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glUseProgram %u\n", program);
+
    if (_mesa_is_xfb_active_and_unpaused(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glUseProgram(transform feedback active)");
index c4ebf42..040e9fd 100644 (file)
@@ -738,8 +738,10 @@ _mesa_MemoryBarrierByRegion(GLbitfield barriers)
        * That is, if barriers is the special value GL_ALL_BARRIER_BITS, then all
        * barriers allowed by glMemoryBarrierByRegion should be activated."
        */
-      if (barriers == GL_ALL_BARRIER_BITS)
-         return ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+      if (barriers == GL_ALL_BARRIER_BITS) {
+         ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+         return;
+      }
 
       /* From section 7.11.2 of the OpenGL ES 3.1 specification:
        *
index 2a19a17..409b2f0 100644 (file)
@@ -124,8 +124,8 @@ _mesa_ClearStencil( GLint s )
  * \sa glStencilFunc().
  *
  * Verifies the parameters and updates the respective values in
- * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies the
- * driver via the dd_function_table::StencilFunc callback.
+ * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies
+ * the driver via the dd_function_table::StencilFunc callback.
  */
 void GLAPIENTRY
 _mesa_StencilFuncSeparateATI( GLenum frontfunc, GLenum backfunc, GLint ref, GLuint mask )
@@ -178,8 +178,8 @@ _mesa_StencilFuncSeparateATI( GLenum frontfunc, GLenum backfunc, GLint ref, GLui
  * \sa glStencilFunc().
  *
  * Verifies the parameters and updates the respective values in
- * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies the
- * driver via the dd_function_table::StencilFunc callback.
+ * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies
+ * the driver via the dd_function_table::StencilFunc callback.
  */
 void GLAPIENTRY
 _mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
@@ -298,8 +298,8 @@ _mesa_StencilMask( GLuint mask )
  * \sa glStencilOp().
  * 
  * Verifies the parameters and updates the respective fields in
- * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies the
- * driver via the dd_function_table::StencilOp callback.
+ * __struct gl_contextRec::Stencil. On change flushes the vertices and notifies
+ * the driver via the dd_function_table::StencilOp callback.
  */
 void GLAPIENTRY
 _mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
@@ -389,12 +389,6 @@ _mesa_ActiveStencilFaceEXT(GLenum face)
 
 
 
-/**
- * OpenGL 2.0 function.
- * \todo Make StencilOp() call this function.  And eventually remove the
- * ctx->Driver.StencilOp function and use ctx->Driver.StencilOpSeparate
- * instead.
- */
 void GLAPIENTRY
 _mesa_StencilOpSeparate(GLenum face, GLenum sfail, GLenum zfail, GLenum zpass)
 {
index d288b1d..7610bcb 100644 (file)
@@ -1844,6 +1844,10 @@ const struct function gl_core_functions_possible[] = {
    { "glGetQueryBufferObjecti64v", 45, -1 },
    { "glGetQueryBufferObjectui64v", 45, -1 },
 
+   /* GL_ARB_indirect_parameters */
+   { "glMultiDrawArraysIndirectCountARB", 31, -1 },
+   { "glMultiDrawElementsIndirectCountARB", 31, -1 },
+
    { NULL, 0, -1 }
 };
 
index 73b3318..50141be 100644 (file)
@@ -2247,6 +2247,22 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
                      _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
+   } else {
+      /*
+       * Section 8.6 (Alternate Texture Image Specification Commands) of the
+       * OpenGL 4.5 (Compatibility Profile) spec says:
+       *
+       *     "Parameters level, internalformat, and border are specified using
+       *     the same values, with the same meanings, as the corresponding
+       *     arguments of TexImage2D, except that internalformat may not be
+       *     specified as 1, 2, 3, or 4."
+       */
+      if (internalFormat >= 1 && internalFormat <= 4) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "glCopyTexImage%dD(internalFormat=%d)", dimensions,
+                     internalFormat);
+         return GL_TRUE;
+      }
    }
 
    baseFormat = _mesa_base_tex_format(ctx, internalFormat);
index 547055e..b107a8f 100644 (file)
@@ -835,7 +835,7 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
                   incomplete(t, MIPMAP, "TexImage[%d] is missing", i);
                   return;
                }
-               if (img->TexFormat != baseImage->TexFormat) {
+               if (img->InternalFormat != baseImage->InternalFormat) {
                   incomplete(t, MIPMAP, "Format[i] != Format[baseLevel]");
                   return;
                }
index b2ac65f..766a465 100644 (file)
@@ -437,7 +437,7 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
                  dst[didx].i = src[sidx].i ? 1 : 0;
                  break;
               case GLSL_TYPE_DOUBLE:
-                 dst[didx].i = *(double *)&src[sidx].f;
+                 dst[didx].i = IROUNDD(*(double *)&src[sidx].f);
                  break;
               default:
                  assert(!"Should not get here.");
index 758ca24..47f80ce 100644 (file)
@@ -1002,10 +1002,10 @@ _mesa_UniformBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (uniformBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (uniformBlockIndex >= shProg->NumUniformBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                  "glUniformBlockBinding(block index %u >= %u)",
-                 uniformBlockIndex, shProg->NumBufferInterfaceBlocks);
+                 uniformBlockIndex, shProg->NumUniformBlocks);
       return;
    }
 
@@ -1016,17 +1016,22 @@ _mesa_UniformBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding !=
+   if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
        uniformBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
+      const int interface_block_index =
+         shProg->UboInterfaceBlockIndex[uniformBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         uniformBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-        int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex];
+        int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
         if (stage_index != -1) {
            struct gl_shader *sh = shProg->_LinkedShaders[i];
@@ -1054,10 +1059,10 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
    if (!shProg)
       return;
 
-   if (shaderStorageBlockIndex >= shProg->NumBufferInterfaceBlocks) {
+   if (shaderStorageBlockIndex >= shProg->NumShaderStorageBlocks) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                  "glShaderStorageBlockBinding(block index %u >= %u)",
-                 shaderStorageBlockIndex, shProg->NumBufferInterfaceBlocks);
+                 shaderStorageBlockIndex, shProg->NumShaderStorageBlocks);
       return;
    }
 
@@ -1069,17 +1074,22 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       return;
    }
 
-   if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding !=
+   if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
        shaderStorageBlockBinding) {
       int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
+      const int interface_block_index =
+         shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
+
+      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+         shaderStorageBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-        int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex];
+        int stage_index =
+            shProg->InterfaceBlockStageIndex[i][interface_block_index];
 
         if (stage_index != -1) {
            struct gl_shader *sh = shProg->_LinkedShaders[i];
index e92bb11..112a73d 100644 (file)
@@ -433,7 +433,8 @@ compute_version_es1(const struct gl_extensions *extensions)
 }
 
 static GLuint
-compute_version_es2(const struct gl_extensions *extensions)
+compute_version_es2(const struct gl_extensions *extensions,
+                    const struct gl_constants *consts)
 {
    /* OpenGL ES 2.0 is derived from OpenGL 2.0 */
    const bool ver_2_0 = (extensions->ARB_texture_cube_map &&
@@ -464,9 +465,11 @@ compute_version_es2(const struct gl_extensions *extensions)
                          extensions->EXT_texture_snorm &&
                          extensions->NV_primitive_restart &&
                          extensions->OES_depth_texture_cube_map);
+   const bool es31_compute_shader =
+      consts->MaxComputeWorkGroupInvocations >= 128;
    const bool ver_3_1 = (ver_3_0 &&
                          extensions->ARB_arrays_of_arrays &&
-                         extensions->ARB_compute_shader &&
+                         es31_compute_shader &&
                          extensions->ARB_draw_indirect &&
                          extensions->ARB_explicit_uniform_location &&
                          extensions->ARB_framebuffer_no_attachments &&
@@ -508,7 +511,7 @@ _mesa_get_version(const struct gl_extensions *extensions,
    case API_OPENGLES:
       return compute_version_es1(extensions);
    case API_OPENGLES2:
-      return compute_version_es2(extensions);
+      return compute_version_es2(extensions, consts);
    }
    return 0;
 }
index 6522200..493d0e5 100644 (file)
@@ -131,7 +131,7 @@ static const char *types[] = {
 /**
  * Identity matrix.
  */
-static GLfloat Identity[16] = {
+static const GLfloat Identity[16] = {
    1.0, 0.0, 0.0, 0.0,
    0.0, 1.0, 0.0, 0.0,
    0.0, 0.0, 1.0, 0.0,
@@ -654,7 +654,7 @@ static GLboolean invert_matrix_3d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0 || MAT(in,2,2) == 0 )
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
    MAT(out,2,2) = 1.0F / MAT(in,2,2);
@@ -687,7 +687,7 @@ static GLboolean invert_matrix_2d_no_rot( GLmatrix *mat )
    if (MAT(in,0,0) == 0 || MAT(in,1,1) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
 
@@ -709,7 +709,7 @@ static GLboolean invert_matrix_perspective( GLmatrix *mat )
    if (MAT(in,2,3) == 0)
       return GL_FALSE;
 
-   memcpy( out, Identity, 16 * sizeof(GLfloat) );
+   memcpy( out, Identity, sizeof(Identity) );
 
    MAT(out,0,0) = 1.0F / MAT(in,0,0);
    MAT(out,1,1) = 1.0F / MAT(in,1,1);
@@ -802,7 +802,7 @@ _math_matrix_rotate( GLmatrix *mat,
    s = sinf( angle * M_PI / 180.0 );
    c = cosf( angle * M_PI / 180.0 );
 
-   memcpy(m, Identity, sizeof(GLfloat)*16);
+   memcpy(m, Identity, sizeof(Identity));
    optimized = GL_FALSE;
 
 #define M(row,col)  m[col*4+row]
@@ -1136,8 +1136,8 @@ _math_matrix_viewport(GLmatrix *m, const float scale[3],
 void
 _math_matrix_set_identity( GLmatrix *mat )
 {
-   memcpy( mat->m, Identity, 16*sizeof(GLfloat) );
-   memcpy( mat->inv, Identity, 16*sizeof(GLfloat) );
+   memcpy( mat->m, Identity, sizeof(Identity) );
+   memcpy( mat->inv, Identity, sizeof(Identity) );
 
    mat->type = MATRIX_IDENTITY;
    mat->flags &= ~(MAT_DIRTY_FLAGS|
@@ -1437,7 +1437,7 @@ _math_matrix_is_dirty( const GLmatrix *m )
 void
 _math_matrix_copy( GLmatrix *to, const GLmatrix *from )
 {
-   memcpy( to->m, from->m, sizeof(Identity) );
+   memcpy(to->m, from->m, 16 * sizeof(GLfloat));
    memcpy(to->inv, from->inv, 16 * sizeof(GLfloat));
    to->flags = from->flags;
    to->type = from->type;
index a0a42da..9da9733 100644 (file)
@@ -541,6 +541,7 @@ type_size(const struct glsl_type *type)
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
       assert(!"Invalid type in type_size");
       break;
    }
@@ -1113,7 +1114,13 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         src_reg temp = get_temp(glsl_type::vec4_type);
-        emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]);
+         if (ir->operands[0]->type->is_boolean() &&
+             ir->operands[1]->as_constant() &&
+             ir->operands[1]->as_constant()->is_zero()) {
+            temp = op[0];
+         } else {
+            emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]);
+         }
 
         /* After the dot-product, the value will be an integer on the
          * range [0,4].  Zero stays zero, and positive values become 1.0.
@@ -1139,32 +1146,6 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       }
       break;
 
-   case ir_unop_any: {
-      assert(ir->operands[0]->type->is_vector());
-
-      /* After the dot-product, the value will be an integer on the
-       * range [0,4].  Zero stays zero, and positive values become 1.0.
-       */
-      ir_to_mesa_instruction *const dp =
-        emit_dp(ir, result_dst, op[0], op[0],
-                ir->operands[0]->type->vector_elements);
-      if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
-        /* The clamping to [0,1] can be done for free in the fragment
-         * shader with a saturate.
-         */
-        dp->saturate = true;
-      } else {
-        /* Negating the result of the dot-product gives values on the range
-         * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
-         * is achieved using SLT.
-         */
-        src_reg slt_src = result_src;
-        slt_src.negate = ~slt_src.negate;
-        emit(ir, OPCODE_SLT, result_dst, slt_src, src_reg_for_float(0.0));
-      }
-      break;
-   }
-
    case ir_binop_logic_xor:
       emit(ir, OPCODE_SNE, result_dst, op[0], op[1]);
       break;
@@ -1323,9 +1304,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       break;
 
    case ir_binop_vector_extract:
-   case ir_binop_bfm:
    case ir_triop_fma:
-   case ir_triop_bfi:
    case ir_triop_bitfield_extract:
    case ir_triop_vector_insert:
    case ir_quadop_bitfield_insert:
@@ -2468,6 +2447,7 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
          case GLSL_TYPE_STRUCT:
          case GLSL_TYPE_ERROR:
          case GLSL_TYPE_INTERFACE:
+         case GLSL_TYPE_FUNCTION:
            assert(!"Should not get here.");
            break;
         }
index 53e9813..e98946b 100644 (file)
@@ -89,6 +89,37 @@ _mesa_free_parameter_list(struct gl_program_parameter_list *paramList)
 
 
 /**
+ * Make sure there are enough unused parameter slots. Reallocate the list
+ * if needed.
+ *
+ * \param paramList        where to reserve parameter slots
+ * \param reserve_slots    number of slots to reserve
+ */
+void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots)
+{
+   const GLuint oldNum = paramList->NumParameters;
+
+   if (oldNum + reserve_slots > paramList->Size) {
+      /* Need to grow the parameter list array (alloc some extra) */
+      paramList->Size = paramList->Size + 4 * reserve_slots;
+
+      /* realloc arrays */
+      paramList->Parameters =
+         realloc(paramList->Parameters,
+                 paramList->Size * sizeof(struct gl_program_parameter));
+
+      paramList->ParameterValues = (gl_constant_value (*)[4])
+         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
+                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
+                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
+                             16);
+   }
+}
+
+
+/**
  * Add a new parameter to a parameter list.
  * Note that parameter values are usually 4-element GLfloat vectors.
  * When size > 4 we'll allocate a sequential block of parameters to
@@ -115,21 +146,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
 
    assert(size > 0);
 
-   if (oldNum + sz4 > paramList->Size) {
-      /* Need to grow the parameter list array (alloc some extra) */
-      paramList->Size = paramList->Size + 4 * sz4;
-
-      /* realloc arrays */
-      paramList->Parameters =
-         realloc(paramList->Parameters,
-                 paramList->Size * sizeof(struct gl_program_parameter));
-
-      paramList->ParameterValues = (gl_constant_value (*)[4])
-         _mesa_align_realloc(paramList->ParameterValues,         /* old buf */
-                             oldNum * 4 * sizeof(gl_constant_value),/* old sz */
-                             paramList->Size*4*sizeof(gl_constant_value),/*new*/
-                             16);
-   }
+   _mesa_reserve_parameter_storage(paramList, sz4);
 
    if (!paramList->Parameters ||
        !paramList->ParameterValues) {
index 74a5fd9..44700b7 100644 (file)
@@ -112,6 +112,10 @@ _mesa_num_parameters(const struct gl_program_parameter_list *list)
    return list ? list->NumParameters : 0;
 }
 
+extern void
+_mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
+                                unsigned reserve_slots);
+
 extern GLint
 _mesa_add_parameter(struct gl_program_parameter_list *paramList,
                     gl_register_file type, const char *name,
index bdb335e..12490d0 100644 (file)
@@ -474,7 +474,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
           * single MAD.
           * linear: fogcoord * -1/(end-start) + end/(end-start)
           * exp: 2^-(density/ln(2) * fogcoord)
-          * exp2: 2^-((density/(ln(2)^2) * fogcoord)^2)
+          * exp2: 2^-((density/(sqrt(ln(2))) * fogcoord)^2)
           */
          value[0] = (ctx->Fog.End == ctx->Fog.Start)
             ? 1.0f : (GLfloat)(-1.0F / (ctx->Fog.End - ctx->Fog.Start));
index 539e3c0..72c9e97 100644 (file)
@@ -554,8 +554,8 @@ static void
 ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
 {
    nir_ssa_def *cmp = b->shader->options->native_integers ?
-      nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))) :
-      nir_fany4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)));
+      nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) :
+      nir_fany_nequal4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_float(b, 0.0));
 
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
@@ -609,6 +609,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
    instr->op = op;
    instr->dest_type = nir_type_float;
    instr->is_shadow = prog_inst->TexShadow;
+   instr->texture_index = prog_inst->TexSrcUnit;
    instr->sampler_index = prog_inst->TexSrcUnit;
 
    switch (prog_inst->TexSrcTarget) {
@@ -927,6 +928,7 @@ ptn_add_output_stores(struct ptn_compile *c)
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
       store->num_components = glsl_get_vector_elements(var->type);
+      store->const_index[0] = (1 << store->num_components) - 1;
       store->variables[0] =
          nir_deref_var_create(store, c->output_vars[var->data.location]);
 
@@ -997,6 +999,7 @@ setup_registers_and_variables(struct ptn_compile *c)
             nir_intrinsic_instr *store =
                nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
             store->num_components = 4;
+            store->const_index[0] = WRITEMASK_XYZW;
             store->variables[0] = nir_deref_var_create(store, fullvar);
             store->src[0] = nir_src_for_ssa(f001);
             nir_builder_instr_insert(b, &store->instr);
@@ -1080,11 +1083,11 @@ prog_to_nir(const struct gl_program *prog,
    c = rzalloc(NULL, struct ptn_compile);
    if (!c)
       return NULL;
-   s = nir_shader_create(NULL, stage, options);
-   if (!s)
-      goto fail;
    c->prog = prog;
 
+   nir_builder_init_simple_shader(&c->build, NULL, stage, options);
+   s = c->build.shader;
+
    if (prog->Parameters->NumParameters > 0) {
       c->parameters = rzalloc(s, nir_variable);
       c->parameters->type =
@@ -1095,14 +1098,6 @@ prog_to_nir(const struct gl_program *prog,
       exec_list_push_tail(&s->uniforms, &c->parameters->node);
    }
 
-   nir_function *func = nir_function_create(s, "main");
-   nir_function_overload *overload = nir_function_overload_create(func);
-   nir_function_impl *impl = nir_function_impl_create(overload);
-
-   c->build.shader = s;
-   c->build.impl = impl;
-   c->build.cursor = nir_after_cf_list(&impl->body);
-
    setup_registers_and_variables(c);
    if (unlikely(c->error))
       goto fail;
index af78150..24dde57 100644 (file)
@@ -589,3 +589,30 @@ _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type)
       }
    }
 }
+
+void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog)
+{
+   GLuint i;
+
+   if (prog->Target != GL_FRAGMENT_PROGRAM_ARB ||
+       !(prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_POS)))
+      return;
+
+   prog->InputsRead &= ~BITFIELD64_BIT(VARYING_SLOT_POS);
+   prog->SystemValuesRead |= 1 << SYSTEM_VALUE_FRAG_COORD;
+
+   for (i = 0; i < prog->NumInstructions; i++) {
+      struct prog_instruction *inst = prog->Instructions + i;
+      const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
+      GLuint j;
+
+      for (j = 0; j < numSrc; j++) {
+         if (inst->SrcReg[j].File == PROGRAM_INPUT &&
+             inst->SrcReg[j].Index == VARYING_SLOT_POS) {
+            inst->SrcReg[j].File = PROGRAM_SYSTEM_VALUE;
+            inst->SrcReg[j].Index = SYSTEM_VALUE_FRAG_COORD;
+         }
+      }
+   }
+}
index 757421e..1520d16 100644 (file)
@@ -51,6 +51,8 @@ _mesa_count_texture_instructions(struct gl_program *prog);
 extern void
 _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type);
 
+extern void
+_mesa_program_fragment_position_to_sysval(struct gl_program *prog);
 
 #ifdef __cplusplus
 }
index 43dbadd..0309722 100644 (file)
@@ -33,7 +33,6 @@
 #include "pipe/p_defines.h"
 #include "st_context.h"
 #include "st_atom.h"
-#include "st_cb_bitmap.h"
 #include "st_program.h"
 #include "st_manager.h"
 
@@ -96,27 +95,26 @@ void st_destroy_atoms( struct st_context *st )
 }
 
 
-/***********************************************************************
- */
 
-static GLboolean check_state( const struct st_state_flags *a,
-                             const struct st_state_flags *b )
+static bool
+check_state(const struct st_state_flags *a, const struct st_state_flags *b)
 {
-   return ((a->mesa & b->mesa) ||
-          (a->st & b->st));
+   return (a->mesa & b->mesa) || (a->st & b->st);
 }
 
-static void accumulate_state( struct st_state_flags *a,
-                             const struct st_state_flags *b )
+
+static void
+accumulate_state(struct st_state_flags *a, const struct st_state_flags *b)
 {
    a->mesa |= b->mesa;
    a->st |= b->st;
 }
 
 
-static void xor_states( struct st_state_flags *result,
-                            const struct st_state_flags *a,
-                             const struct st_state_flags *b )
+static void
+xor_states(struct st_state_flags *result,
+           const struct st_state_flags *a,
+           const struct st_state_flags *b)
 {
    result->mesa = a->mesa ^ b->mesa;
    result->st = a->st ^ b->st;
@@ -181,14 +179,11 @@ void st_validate_state( struct st_context *st )
 
    check_attrib_edgeflag(st);
 
-   if (state->mesa)
-      st_flush_bitmap_cache(st);
-
    check_program_state( st );
 
    st_manager_validate_framebuffers(st);
 
-   if (state->st == 0)
+   if (state->st == 0 && state->mesa == 0)
       return;
 
    /*printf("%s %x/%x\n", __func__, state->mesa, state->st);*/
@@ -245,6 +240,3 @@ void st_validate_state( struct st_context *st )
 
    memset(state, 0, sizeof(*state));
 }
-
-
-
index 20f8b3d..66811d2 100644 (file)
@@ -84,6 +84,7 @@ void st_upload_constants( struct st_context *st,
          cb.buffer = NULL;
          cb.user_buffer = NULL;
          u_upload_data(st->constbuf_uploader, 0, paramBytes,
+                       st->ctx->Const.UniformBufferOffsetAlignment,
                        params->ParameterValues, &cb.buffer_offset, &cb.buffer);
          u_upload_unmap(st->constbuf_uploader);
       } else {
index 55d5e66..c20cadf 100644 (file)
@@ -220,13 +220,13 @@ static void update_raster_state( struct st_context *st )
    raster->line_smooth = ctx->Line.SmoothFlag;
    if (ctx->Line.SmoothFlag) {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidthAA,
-                                ctx->Const.MaxLineWidthAA);
+                                 ctx->Const.MinLineWidthAA,
+                                 ctx->Const.MaxLineWidthAA);
    }
    else {
       raster->line_width = CLAMP(ctx->Line.Width,
-                                ctx->Const.MinLineWidth,
-                                ctx->Const.MaxLineWidth);
+                                 ctx->Const.MinLineWidth,
+                                 ctx->Const.MaxLineWidth);
    }
 
    raster->line_stipple_enable = ctx->Line.StippleFlag;
index cbc6845..d8c3dbd 100644 (file)
@@ -204,7 +204,7 @@ setup_bitmap_vertex_data(struct st_context *st, bool normalized,
       tBot = (GLfloat) height;
    }
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   vbuf_offset, vbuf, (void **) &vertices);
    if (!*vbuf) {
       return;
@@ -287,7 +287,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       GLfloat colorSave[4];
       COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color);
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
       COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave);
    }
 
@@ -404,6 +405,9 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_stream_outputs(cso);
 
    pipe_resource_reference(&vbuf, NULL);
+
+   /* We uploaded modified constants, need to invalidate them. */
+   st->dirty.mesa |= _NEW_PROGRAM_CONSTANTS;
 }
 
 
@@ -490,15 +494,15 @@ st_flush_bitmap_cache(struct st_context *st)
 
       assert(cache->xmin <= cache->xmax);
 
-/*    printf("flush size %d x %d  at %d, %d\n",
-             cache->xmax - cache->xmin,
-             cache->ymax - cache->ymin,
-             cache->xpos, cache->ypos);
-*/
+      if (0)
+         printf("flush bitmap, size %d x %d  at %d, %d\n",
+                cache->xmax - cache->xmin,
+                cache->ymax - cache->ymin,
+                cache->xpos, cache->ypos);
 
       /* The texture transfer has been mapped until now.
-          * So unmap and release the texture transfer before drawing.
-          */
+       * So unmap and release the texture transfer before drawing.
+       */
       if (cache->trans && cache->buffer) {
          if (0)
             print_cache(cache);
@@ -615,10 +619,17 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
    struct st_context *st = st_context(ctx);
    struct pipe_resource *pt;
 
-   if (width == 0 || height == 0)
-      return;
+   assert(width > 0);
+   assert(height > 0);
 
-   st_validate_state(st);
+   /* We only need to validate state if the st dirty flags are set or
+    * any non-_NEW_PROGRAM_CONSTANTS mesa flags are set.  The VS we use
+    * for bitmap drawing uses no constants and the FS constants are
+    * explicitly uploaded in the draw_bitmap_quad() function.
+    */
+   if ((st->dirty.mesa & ~_NEW_PROGRAM_CONSTANTS) || st->dirty.st) {
+      st_validate_state(st);
+   }
 
    if (!st->bitmap.vs) {
       /* create pass-through vertex shader now */
index 5d20b26..68be8ba 100644 (file)
@@ -83,9 +83,7 @@ st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj)
    if (st_obj->buffer)
       pipe_resource_reference(&st_obj->buffer, NULL);
 
-   mtx_destroy(&st_obj->Base.Mutex);
-   free(st_obj->Base.Label);
-   free(st_obj);
+   _mesa_delete_buffer_object(ctx, obj);
 }
 
 
@@ -184,25 +182,31 @@ st_bufferobj_data(struct gl_context *ctx,
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
+   struct pipe_screen *screen = pipe->screen;
    struct st_buffer_object *st_obj = st_buffer_object(obj);
    unsigned bind, pipe_usage, pipe_flags = 0;
 
    if (target != GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD &&
-       size && data && st_obj->buffer &&
+       size && st_obj->buffer &&
        st_obj->Base.Size == size &&
        st_obj->Base.Usage == usage &&
        st_obj->Base.StorageFlags == storageFlags) {
-      /* Just discard the old contents and write new data.
-       * This should be the same as creating a new buffer, but we avoid
-       * a lot of validation in Mesa.
-       */
-      struct pipe_box box;
-
-      u_box_1d(0, size, &box);
-      pipe->transfer_inline_write(pipe, st_obj->buffer, 0,
-                                  PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
-                                  &box, data, 0, 0);
-      return GL_TRUE;
+      if (data) {
+         /* Just discard the old contents and write new data.
+          * This should be the same as creating a new buffer, but we avoid
+          * a lot of validation in Mesa.
+          */
+         struct pipe_box box;
+
+         u_box_1d(0, size, &box);
+         pipe->transfer_inline_write(pipe, st_obj->buffer, 0,
+                                    PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
+                                    &box, data, 0, 0);
+         return GL_TRUE;
+      } else if (screen->get_param(screen, PIPE_CAP_INVALIDATE_BUFFER)) {
+         pipe->invalidate_resource(pipe, st_obj->buffer);
+         return GL_TRUE;
+      }
    }
 
    st_obj->Base.Size = size;
@@ -230,6 +234,7 @@ st_bufferobj_data(struct gl_context *ctx,
       bind = PIPE_BIND_CONSTANT_BUFFER;
       break;
    case GL_DRAW_INDIRECT_BUFFER:
+   case GL_PARAMETER_BUFFER_ARB:
       bind = PIPE_BIND_COMMAND_ARGS_BUFFER;
       break;
    default:
@@ -289,7 +294,6 @@ st_bufferobj_data(struct gl_context *ctx,
    }
 
    if (size != 0) {
-      struct pipe_screen *screen = pipe->screen;
       struct pipe_resource buffer;
 
       memset(&buffer, 0, sizeof buffer);
@@ -329,6 +333,31 @@ st_bufferobj_data(struct gl_context *ctx,
 
 
 /**
+ * Called via glInvalidateBuffer(Sub)Data.
+ */
+static void
+st_bufferobj_invalidate(struct gl_context *ctx,
+                        struct gl_buffer_object *obj,
+                        GLintptr offset,
+                        GLsizeiptr size)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct st_buffer_object *st_obj = st_buffer_object(obj);
+
+   /* We ignore partial invalidates. */
+   if (offset != 0 || size != obj->Size)
+      return;
+
+   /* Nothing to invalidate. */
+   if (!st_obj->buffer)
+      return;
+
+   pipe->invalidate_resource(pipe, st_obj->buffer);
+}
+
+
+/**
  * Called via glMapBufferRange().
  */
 static void *
@@ -513,7 +542,8 @@ st_bufferobj_validate_usage(struct st_context *st,
 
 
 void
-st_init_bufferobject_functions(struct dd_function_table *functions)
+st_init_bufferobject_functions(struct pipe_screen *screen,
+                               struct dd_function_table *functions)
 {
    /* plug in default driver fallbacks (such as for ClearBufferSubData) */
    _mesa_init_buffer_object_functions(functions);
@@ -528,4 +558,7 @@ st_init_bufferobject_functions(struct dd_function_table *functions)
    functions->UnmapBuffer = st_bufferobj_unmap;
    functions->CopyBufferSubData = st_copy_buffer_subdata;
    functions->ClearBufferSubData = st_clear_buffer_subdata;
+
+   if (screen->get_param(screen, PIPE_CAP_INVALIDATE_BUFFER))
+      functions->InvalidateBufferSubData = st_bufferobj_invalidate;
 }
index 647efe4..54e6a21 100644 (file)
@@ -33,6 +33,7 @@
 
 struct dd_function_table;
 struct pipe_resource;
+struct pipe_screen;
 struct st_context;
 
 /**
@@ -62,7 +63,8 @@ st_bufferobj_validate_usage(struct st_context *st,
 
 
 extern void
-st_init_bufferobject_functions(struct dd_function_table *functions);
+st_init_bufferobject_functions(struct pipe_screen *screen,
+                               struct dd_function_table *functions);
 
 
 #endif
index 18efd14..7b6d10e 100644 (file)
@@ -41,6 +41,7 @@
 #include "program/prog_instruction.h"
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_clear.h"
 #include "st_cb_fbo.h"
 #include "st_format.h"
@@ -184,7 +185,7 @@ draw_quad(struct st_context *st,
 
    vb.stride = 8 * sizeof(float);
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), 4,
                   &vb.buffer_offset, &vb.buffer,
                   (void **) &vertices);
    if (!vb.buffer) {
@@ -466,6 +467,8 @@ st_Clear(struct gl_context *ctx, GLbitfield mask)
    GLbitfield clear_buffers = 0x0;
    GLuint i;
 
+   st_flush_bitmap_cache(st);
+
    /* This makes sure the pipe has the latest scissor, etc values */
    st_validate_state( st );
 
index 262ad80..04a9de0 100644 (file)
@@ -50,6 +50,7 @@
 
 #include "st_atom.h"
 #include "st_atom_constbuf.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawpixels.h"
 #include "st_cb_readpixels.h"
 #include "st_cb_fbo.h"
@@ -457,7 +458,7 @@ draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z,
    struct pipe_resource *buf = NULL;
    unsigned offset;
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), &offset,
+   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), 4, &offset,
                   &buf, (void **) &verts);
    if (!buf) {
       return;
@@ -1063,6 +1064,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    /* Mesa state should be up to date by now */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* Limit the size of the glDrawPixels to the max texture size.
@@ -1110,8 +1113,11 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* Compiling a new fragment shader variant added new state constants
+       * into the constant buffer; we need to update them.
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
 
    /* Put glDrawPixels image into a texture */
@@ -1296,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
        ctx->_ImageTransferState == 0x0 &&
        !ctx->Color.BlendEnabled &&
        !ctx->Color.AlphaEnabled &&
+       (!ctx->Color.ColorLogicOpEnabled || ctx->Color.LogicOp == GL_COPY) &&
        !ctx->Depth.Test &&
        !ctx->Fog.Enabled &&
        !ctx->Stencil.Enabled &&
@@ -1419,6 +1426,8 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    GLint readX, readY, readW, readH;
    struct gl_pixelstore_attrib pack = ctx->DefaultPacking;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (type == GL_DEPTH_STENCIL) {
@@ -1463,8 +1472,11 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
          num_sampler_view++;
       }
 
-      /* update fragment program constants */
-      st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT);
+      /* Compiling a new fragment shader variant added new state constants
+       * into the constant buffer; we need to update them.
+       */
+      st_upload_constants(st, st->fp->Base.Base.Parameters,
+                          PIPE_SHADER_FRAGMENT);
    }
    else {
       assert(type == GL_DEPTH);
index 2634b09..e6ab77f 100644 (file)
@@ -21,6 +21,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_drawtex.h"
 
 #include "pipe/p_context.h"
@@ -113,6 +114,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    struct pipe_vertex_element velements[2 + MAX_TEXTURE_UNITS];
    unsigned offset;
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    /* determine if we need vertex color */
@@ -150,7 +153,7 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
       GLuint attr;
 
       u_upload_alloc(st->uploader, 0,
-                     numAttribs * 4 * 4 * sizeof(GLfloat),
+                     numAttribs * 4 * 4 * sizeof(GLfloat), 4,
                      &offset, &vbuffer, (void **) &vbuf);
       if (!vbuffer) {
          return;
index 62f149a..f8b3679 100644 (file)
@@ -388,6 +388,57 @@ guess_base_level_size(GLenum target,
 
 
 /**
+ * Try to determine whether we should allocate memory for a full texture
+ * mipmap.  The problem is when we get a glTexImage(level=0) call, we
+ * can't immediately know if other mipmap levels are coming next.  Here
+ * we try to guess whether to allocate memory for a mipmap or just the
+ * 0th level.
+ *
+ * If we guess incorrectly here we'll later reallocate the right amount of
+ * memory either in st_AllocTextureImageBuffer() or st_finalize_texture().
+ *
+ * \param stObj  the texture object we're going to allocate memory for.
+ * \param stImage  describes the incoming image which we need to store.
+ */
+static boolean
+allocate_full_mipmap(const struct st_texture_object *stObj,
+                     const struct st_texture_image *stImage)
+{
+   switch (stObj->base.Target) {
+   case GL_TEXTURE_RECTANGLE_NV:
+   case GL_TEXTURE_BUFFER:
+   case GL_TEXTURE_EXTERNAL_OES:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      /* these texture types cannot be mipmapped */
+      return FALSE;
+   }
+
+   if (stImage->base.Level > 0 || stObj->base.GenerateMipmap)
+      return TRUE;
+
+   if (stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
+       stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT)
+      /* depth/stencil textures are seldom mipmapped */
+      return FALSE;
+
+   if (stObj->base.BaseLevel == 0 && stObj->base.MaxLevel == 0)
+      return FALSE;
+
+   if (stObj->base.Sampler.MinFilter == GL_NEAREST ||
+       stObj->base.Sampler.MinFilter == GL_LINEAR)
+      /* not a mipmap minification filter */
+      return FALSE;
+
+   if (stObj->base.Target == GL_TEXTURE_3D)
+      /* 3D textures are seldom mipmapped */
+      return FALSE;
+
+   return TRUE;
+}
+
+
+/**
  * Try to allocate a pipe_resource object for the given st_texture_object.
  *
  * We use the given st_texture_image as a clue to determine the size of the
@@ -431,22 +482,15 @@ guess_and_alloc_texture(struct st_context *st,
     * to re-allocating a texture buffer with space for more (or fewer)
     * mipmap levels later.
     */
-   if ((stObj->base.Sampler.MinFilter == GL_NEAREST ||
-        stObj->base.Sampler.MinFilter == GL_LINEAR ||
-        (stObj->base.BaseLevel == 0 &&
-         stObj->base.MaxLevel == 0) ||
-        stImage->base._BaseFormat == GL_DEPTH_COMPONENT ||
-        stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT) &&
-       !stObj->base.GenerateMipmap &&
-       stImage->base.Level == 0) {
-      /* only alloc space for a single mipmap level */
-      lastLevel = 0;
-   }
-   else {
+   if (allocate_full_mipmap(stObj, stImage)) {
       /* alloc space for a full mipmap */
       lastLevel = _mesa_get_tex_max_num_levels(stObj->base.Target,
                                                width, height, depth) - 1;
    }
+   else {
+      /* only alloc space for a single mipmap level */
+      lastLevel = 0;
+   }
 
    /* Save the level=0 dimensions */
    stObj->width0 = width;
index 1459f25..4add50e 100644 (file)
@@ -80,9 +80,26 @@ DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
 
 
 /**
+ * Called via ctx->Driver.Enable()
+ */
+static void st_Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
+{
+   struct st_context *st = st_context(ctx);
+
+   switch (cap) {
+   case GL_DEBUG_OUTPUT:
+      st_enable_debug_output(st, state);
+      break;
+   default:
+      break;
+   }
+}
+
+
+/**
  * Called via ctx->Driver.UpdateState()
  */
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state)
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)
 {
    struct st_context *st = st_context(ctx);
 
@@ -172,20 +189,19 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* Create upload manager for vertex data for glBitmap, glDrawPixels,
     * glClear, etc.
     */
-   st->uploader = u_upload_create(st->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+   st->uploader = u_upload_create(st->pipe, 65536, PIPE_BIND_VERTEX_BUFFER,
+                                  PIPE_USAGE_STREAM);
 
    if (!screen->get_param(screen, PIPE_CAP_USER_INDEX_BUFFERS)) {
-      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024, 4,
-                                              PIPE_BIND_INDEX_BUFFER);
+      st->indexbuf_uploader = u_upload_create(st->pipe, 128 * 1024,
+                                              PIPE_BIND_INDEX_BUFFER,
+                                              PIPE_USAGE_STREAM);
    }
 
-   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS)) {
-      unsigned alignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
-
-      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024, alignment,
-                                              PIPE_BIND_CONSTANT_BUFFER);
-   }
+   if (!screen->get_param(screen, PIPE_CAP_USER_CONSTANT_BUFFERS))
+      st->constbuf_uploader = u_upload_create(pipe, 128 * 1024,
+                                              PIPE_BIND_CONSTANT_BUFFER,
+                                              PIPE_USAGE_STREAM);
 
    st->cso_context = cso_create_context(pipe);
 
@@ -249,6 +265,10 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
           PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600));
    st->has_time_elapsed =
       screen->get_param(screen, PIPE_CAP_QUERY_TIME_ELAPSED);
+   st->has_half_float_packing =
+      screen->get_param(screen, PIPE_CAP_TGSI_PACK_HALF_FLOAT);
+   st->has_multi_draw_indirect =
+      screen->get_param(screen, PIPE_CAP_MULTI_DRAW_INDIRECT);
 
    /* GL limits and extensions */
    st_init_limits(st->pipe->screen, &ctx->Const, &ctx->Extensions);
@@ -426,7 +446,7 @@ void st_init_driver_functions(struct pipe_screen *screen,
    _mesa_init_sampler_object_functions(functions);
 
    st_init_blit_functions(functions);
-   st_init_bufferobject_functions(functions);
+   st_init_bufferobject_functions(screen, functions);
    st_init_clear_functions(functions);
    st_init_bitmap_functions(functions);
    st_init_copy_image_functions(functions);
@@ -456,5 +476,6 @@ void st_init_driver_functions(struct pipe_screen *screen,
 
    st_init_vdpau_functions(functions);
 
+   functions->Enable = st_Enable;
    functions->UpdateState = st_invalidate_state;
 }
index 60a9a4b..9db5f11 100644 (file)
@@ -65,8 +65,8 @@ struct u_upload_mgr;
 
 
 struct st_state_flags {
-   GLuint mesa;
-   uint64_t st;
+   GLbitfield mesa;  /**< Mask of _NEW_x flags */
+   uint64_t st;      /**< Mask of ST_NEW_x flags */
 };
 
 struct st_tracked_state {
@@ -101,6 +101,8 @@ struct st_context
    boolean prefer_blit_based_texture_transfer;
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
+   boolean has_half_float_packing;
+   boolean has_multi_draw_indirect;
 
    /**
     * If a shader can be created when we get its source.
@@ -251,7 +253,7 @@ struct st_framebuffer
 extern void st_init_driver_functions(struct pipe_screen *screen,
                                      struct dd_function_table *functions);
 
-void st_invalidate_state(struct gl_context * ctx, GLuint new_state);
+void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state);
 
 
 
index d246d8b..4e0fd79 100644 (file)
@@ -59,7 +59,7 @@ st_copy_framebuffer_to_texture(GLenum srcBuffer,
    _mesa_GetIntegerv(GL_READ_BUFFER, &readBufSave);
 
    /* Read from the winsys buffer */
-   _mesa_BindFramebuffer(GL_READ_BUFFER, 0);
+   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
    _mesa_ReadBuffer(srcBuffer);
 
    /* copy image from pbuffer to texture */
@@ -136,5 +136,5 @@ st_copy_framebuffer_to_texture(GLenum srcBuffer,
 
    /* restore readbuffer */
    _mesa_ReadBuffer(readBufSave);
-   _mesa_BindFramebuffer(GL_READ_BUFFER, readFBOSave);
+   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, readFBOSave);
 }
index 6d859c6..134366d 100644 (file)
@@ -104,3 +104,75 @@ st_print_current(void)
 }
 
 
+/**
+ * Installed as pipe_debug_callback when GL_DEBUG_OUTPUT is enabled.
+ */
+static void
+st_debug_message(void *data,
+                 unsigned *id,
+                 enum pipe_debug_type ptype,
+                 const char *fmt,
+                 va_list args)
+{
+   struct st_context *st = data;
+   enum mesa_debug_source source;
+   enum mesa_debug_type type;
+   enum mesa_debug_severity severity;
+
+   switch (ptype) {
+   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_ERROR:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_SHADER_INFO:
+      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_PERF_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_FALLBACK:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_CONFORMANCE:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   default:
+      unreachable("invalid debug type");
+   }
+   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
+}
+
+void
+st_enable_debug_output(struct st_context *st, boolean enable)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   if (!pipe->set_debug_callback)
+      return;
+
+   if (enable) {
+      struct pipe_debug_callback cb = { st_debug_message, st };
+      pipe->set_debug_callback(pipe, &cb);
+   } else {
+      pipe->set_debug_callback(pipe, NULL);
+   }
+}
index 288eccf..ed3ead8 100644 (file)
@@ -32,6 +32,8 @@
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
 
+struct st_context;
+
 extern void
 st_print_current(void);
 
@@ -59,6 +61,8 @@ extern int ST_DEBUG;
 
 void st_debug_init( void );
 
+void st_enable_debug_output(struct st_context *st, boolean enable);
+
 static inline void
 ST_DBG( unsigned flag, const char *fmt, ... )
 {
index f4b273b..03788f3 100644 (file)
@@ -48,6 +48,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_cb_xformfb.h"
 #include "st_debug.h"
@@ -107,7 +108,7 @@ setup_index_buffer(struct st_context *st,
    else if (st->indexbuf_uploader) {
       /* upload indexes from user memory into a real buffer */
       u_upload_data(st->indexbuf_uploader, 0,
-                    ib->count * ibuffer->index_size, ib->ptr,
+                    ib->count * ibuffer->index_size, 4, ib->ptr,
                     &ibuffer->offset, &ibuffer->buffer);
       if (!ibuffer->buffer) {
          /* out of memory */
@@ -197,6 +198,8 @@ st_draw_vbo(struct gl_context *ctx,
    /* Mesa core state should have been validated already */
    assert(ctx->NewState == 0x0);
 
+   st_flush_bitmap_cache(st);
+
    /* Validate state. */
    if (st->dirty.st || ctx->NewDriverState) {
       st_validate_state(st);
@@ -249,13 +252,7 @@ st_draw_vbo(struct gl_context *ctx,
       }
    }
 
-   if (indirect) {
-      info.indirect = st_buffer_object(indirect)->buffer;
-
-      /* Primitive restart is not handled by the VBO module in this case. */
-      info.primitive_restart = ctx->Array._PrimitiveRestart;
-      info.restart_index = ctx->Array.RestartIndex;
-   }
+   assert(!indirect);
 
    /* do actual drawing */
    for (i = 0; i < nr_prims; i++) {
@@ -266,11 +263,11 @@ st_draw_vbo(struct gl_context *ctx,
       info.instance_count = prims[i].num_instances;
       info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
+      info.drawid = prims[i].draw_id;
       if (!ib) {
          info.min_index = info.start;
          info.max_index = info.start + info.count - 1;
       }
-      info.indirect_offset = prims[i].indirect_offset;
 
       if (ST_DEBUG & DEBUG_DRAW) {
          debug_printf("st/draw: mode %s  start %u  count %u  indexed %d\n",
@@ -280,7 +277,7 @@ st_draw_vbo(struct gl_context *ctx,
                       info.indexed);
       }
 
-      if (info.count_from_stream_output || info.indirect) {
+      if (info.count_from_stream_output) {
          cso_draw_vbo(st->cso_context, &info);
       }
       else if (info.primitive_restart) {
@@ -297,6 +294,84 @@ st_draw_vbo(struct gl_context *ctx,
    }
 }
 
+static void
+st_indirect_draw_vbo(struct gl_context *ctx,
+                     GLuint mode,
+                     struct gl_buffer_object *indirect_data,
+                     GLsizeiptr indirect_offset,
+                     unsigned draw_count,
+                     unsigned stride,
+                     struct gl_buffer_object *indirect_params,
+                     GLsizeiptr indirect_params_offset,
+                     const struct _mesa_index_buffer *ib)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_index_buffer ibuffer = {0};
+   struct pipe_draw_info info;
+
+   /* Mesa core state should have been validated already */
+   assert(ctx->NewState == 0x0);
+   assert(stride);
+
+   /* Validate state. */
+   if (st->dirty.st || ctx->NewDriverState) {
+      st_validate_state(st);
+   }
+
+   if (st->vertex_array_out_of_memory) {
+      return;
+   }
+
+   util_draw_init_info(&info);
+
+   if (ib) {
+      if (!setup_index_buffer(st, ib, &ibuffer)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDrawElementsIndirect%s",
+                     (draw_count > 1) ? "Multi" : "",
+                     indirect_params ? "CountARB" : "");
+         return;
+      }
+
+      info.indexed = TRUE;
+   }
+
+   info.mode = translate_prim(ctx, mode);
+   info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
+   info.indirect = st_buffer_object(indirect_data)->buffer;
+   info.indirect_offset = indirect_offset;
+
+   /* Primitive restart is not handled by the VBO module in this case. */
+   info.primitive_restart = ctx->Array._PrimitiveRestart;
+   info.restart_index = ctx->Array.RestartIndex;
+
+   if (ST_DEBUG & DEBUG_DRAW) {
+      debug_printf("st/draw indirect: mode %s drawcount %d indexed %d\n",
+                   u_prim_name(info.mode),
+                   draw_count,
+                   info.indexed);
+   }
+
+   if (!st->has_multi_draw_indirect) {
+      int i;
+
+      assert(!indirect_params);
+      info.indirect_count = 1;
+      for (i = 0; i < draw_count; i++) {
+         info.drawid = i;
+         cso_draw_vbo(st->cso_context, &info);
+         info.indirect_offset += stride;
+      }
+   } else {
+      info.indirect_count = draw_count;
+      info.indirect_stride = stride;
+      if (indirect_params) {
+         info.indirect_params = st_buffer_object(indirect_params)->buffer;
+         info.indirect_params_offset = indirect_params_offset;
+      }
+      cso_draw_vbo(st->cso_context, &info);
+   }
+}
+
 
 void
 st_init_draw(struct st_context *st)
@@ -304,6 +379,7 @@ st_init_draw(struct st_context *st)
    struct gl_context *ctx = st->ctx;
 
    vbo_set_draw_func(ctx, st_draw_vbo);
+   vbo_set_indirect_draw_func(ctx, st_indirect_draw_vbo);
 
    st->draw = draw_create(st->pipe); /* for selection/feedback */
 
index 88c10a8..b6e6dea 100644 (file)
@@ -33,6 +33,7 @@
 
 #include "st_context.h"
 #include "st_atom.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_bufferobjects.h"
 #include "st_draw.h"
 #include "st_program.h"
@@ -137,6 +138,8 @@ st_feedback_draw_vbo(struct gl_context *ctx,
 
    assert(draw);
 
+   st_flush_bitmap_cache(st);
+
    st_validate_state(st);
 
    if (!index_bounds_valid)
index a2418e2..2a3e523 100644 (file)
@@ -314,10 +314,11 @@ void st_init_limits(struct pipe_screen *screen,
    c->GLSLSkipStrictMaxUniformLimitCheck =
       screen->get_param(screen, PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS);
 
+   c->UniformBufferOffsetAlignment =
+      screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
+
    if (can_ubo) {
       extensions->ARB_uniform_buffer_object = GL_TRUE;
-      c->UniformBufferOffsetAlignment =
-         screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
       c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings =
          c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks +
          c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks +
@@ -326,6 +327,11 @@ void st_init_limits(struct pipe_screen *screen,
          c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks;
       assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS);
    }
+
+   c->GLSLFragCoordIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL);
+   c->GLSLFrontFacingIsSysVal =
+      screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL);
 }
 
 
@@ -440,34 +446,47 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
       { o(ARB_clear_texture),                PIPE_CAP_CLEAR_TEXTURE                    },
+      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
       { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
+      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
       { o(ARB_copy_image),                   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
+      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
+      { o(ARB_indirect_parameters),          PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS       },
       { o(ARB_instanced_arrays),             PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR  },
       { o(ARB_occlusion_query),              PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
+      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
+      { o(ARB_shader_draw_parameters),       PIPE_CAP_DRAW_PARAMETERS                  },
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
       { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS                        },
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
       { o(ARB_shadow),                       PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_texture_buffer_object),        PIPE_CAP_TEXTURE_BUFFER_OBJECTS           },
+      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
       { o(ARB_texture_gather),               PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS    },
       { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
+      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
       { o(ARB_texture_non_power_of_two),     PIPE_CAP_NPOT_TEXTURES                    },
+      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
+      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
       { o(ARB_timer_query),                  PIPE_CAP_QUERY_TIMESTAMP                  },
       { o(ARB_transform_feedback2),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
       { o(ARB_transform_feedback3),          PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME       },
 
       { o(EXT_blend_equation_separate),      PIPE_CAP_BLEND_EQUATION_SEPARATE          },
+      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
       { o(EXT_draw_buffers2),                PIPE_CAP_INDEP_BLEND_ENABLE               },
+      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
       { o(EXT_stencil_two_side),             PIPE_CAP_TWO_SIDED_STENCIL                },
       { o(EXT_texture_array),                PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS         },
       { o(EXT_texture_filter_anisotropic),   PIPE_CAP_ANISOTROPIC_FILTER               },
@@ -488,17 +507,6 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(OES_standard_derivatives),         PIPE_CAP_SM3                              },
       { o(OES_texture_float_linear),         PIPE_CAP_TEXTURE_FLOAT_LINEAR             },
       { o(OES_texture_half_float_linear),    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR        },
-      { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
-      { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
-      { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
-      { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
-      { o(ARB_draw_indirect),                PIPE_CAP_DRAW_INDIRECT                    },
-      { o(ARB_derivative_control),           PIPE_CAP_TGSI_FS_FINE_DERIVATIVE          },
-      { o(ARB_conditional_render_inverted),  PIPE_CAP_CONDITIONAL_RENDER_INVERTED      },
-      { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
-      { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
-      { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
-      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
    };
 
    /* Required: render target and sampler support */
index b370040..c4b3492 100644 (file)
@@ -80,6 +80,7 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct pipe_resource *pt = st_get_texobj_resource(texObj);
    const uint baseLevel = texObj->BaseLevel;
+   enum pipe_format format;
    uint lastLevel, first_layer, last_layer;
    uint dstLevel;
 
@@ -149,12 +150,24 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
       last_layer = util_max_layer(pt, baseLevel);
    }
 
-   /* Try to generate the mipmap by rendering/texturing.  If that fails,
-    * use the software fallback.
+   if (stObj->surface_based)
+      format = stObj->surface_format;
+   else
+      format = pt->format;
+
+   /* First see if the driver supports hardware mipmap generation,
+    * if not then generate the mipmap by rendering/texturing.
+    * If that fails, use the software fallback.
     */
-   if (!util_gen_mipmap(st->pipe, pt, pt->format, baseLevel, lastLevel,
-                        first_layer, last_layer, PIPE_TEX_FILTER_LINEAR)) {
-      _mesa_generate_mipmap(ctx, target, texObj);
+   if (!st->pipe->screen->get_param(st->pipe->screen,
+                                    PIPE_CAP_GENERATE_MIPMAP) ||
+       !st->pipe->generate_mipmap(st->pipe, pt, format, baseLevel,
+                                  lastLevel, first_layer, last_layer)) {
+
+      if (!util_gen_mipmap(st->pipe, pt, format, baseLevel, lastLevel,
+                           first_layer, last_layer, PIPE_TEX_FILTER_LINEAR)) {
+         _mesa_generate_mipmap(ctx, target, texObj);
+      }
    }
 
    /* Fill in the Mesa gl_texture_image fields */
index b6a0e6b..d424e3b 100644 (file)
@@ -85,6 +85,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type)
@@ -100,6 +101,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg(gl_register_file file, int index, int type, int index2D)
@@ -115,6 +117,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    st_src_reg()
@@ -130,6 +133,7 @@ public:
       this->has_index2 = false;
       this->double_reg2 = false;
       this->array_id = 0;
+      this->is_double_vertex_input = false;
    }
 
    explicit st_src_reg(st_dst_reg reg);
@@ -150,6 +154,7 @@ public:
     */
    bool double_reg2;
    unsigned array_id;
+   bool is_double_vertex_input;
 };
 
 class st_dst_reg {
@@ -224,6 +229,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->has_index2 = reg.has_index2;
    this->double_reg2 = false;
    this->array_id = reg.array_id;
+   this->is_double_vertex_input = false;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -334,8 +340,24 @@ struct array_decl {
    unsigned mesa_index;
    unsigned array_id;
    unsigned array_size;
+   unsigned array_type;
 };
 
+static unsigned
+find_array_type(struct array_decl *arrays, unsigned count, unsigned array_id)
+{
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      if (array_id == decl->array_id) {
+         return decl->array_type;
+      }
+   }
+   return GLSL_TYPE_ERROR;
+}
+
 struct rename_reg_pair {
    int old_reg;
    int new_reg;
@@ -555,6 +577,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 {
    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
    int num_reladdr = 0, i, j;
+   bool dst_is_double[2];
 
    op = get_opcode(ir, op, dst, src0, src1);
 
@@ -658,7 +681,18 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
     * GLSL [0].z -> TGSI [1].xy
     * GLSL [0].w -> TGSI [1].zw
     */
-   if (inst->dst[0].type == GLSL_TYPE_DOUBLE || inst->dst[1].type == GLSL_TYPE_DOUBLE ||
+   for (j = 0; j < 2; j++) {
+      dst_is_double[j] = false;
+      if (inst->dst[j].type == GLSL_TYPE_DOUBLE)
+         dst_is_double[j] = true;
+      else if (inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
+         unsigned type = find_array_type(this->output_arrays, this->num_output_arrays, inst->dst[j].array_id);
+         if (type == GLSL_TYPE_DOUBLE)
+            dst_is_double[j] = true;
+      }
+   }
+
+   if (dst_is_double[0] || dst_is_double[1] ||
        inst->src[0].type == GLSL_TYPE_DOUBLE) {
       glsl_to_tgsi_instruction *dinst = NULL;
       int initial_src_swz[4], initial_src_idx[4];
@@ -699,7 +733,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
 
          /* modify the destination if we are splitting */
          for (j = 0; j < 2; j++) {
-            if (dinst->dst[j].type == GLSL_TYPE_DOUBLE) {
+            if (dst_is_double[j]) {
                dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
                dinst->dst[j].index = initial_dst_idx[j];
                if (i > 1)
@@ -732,7 +766,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                   - F2D is a float src0, DLDEXP is integer src1 */
                if (op == TGSI_OPCODE_F2D ||
                    op == TGSI_OPCODE_DLDEXP ||
-                   (op == TGSI_OPCODE_UCMP && dinst->dst[0].type == GLSL_TYPE_DOUBLE)) {
+                   (op == TGSI_OPCODE_UCMP && dst_is_double[0])) {
                   dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
                }
             }
@@ -1057,7 +1091,7 @@ glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
 }
 
 static int
-type_size(const struct glsl_type *type)
+attrib_type_size(const struct glsl_type *type, bool is_vs_input)
 {
    unsigned int i;
    int size;
@@ -1080,7 +1114,7 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_DOUBLE:
       if (type->is_matrix()) {
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return type->matrix_columns;
          else
             return type->matrix_columns * 2;
@@ -1088,7 +1122,7 @@ type_size(const struct glsl_type *type)
          /* For doubles if we have a double or dvec2 they fit in one
           * vec4, else they need 2 vec4s.
           */
-         if (type->vector_elements <= 2)
+         if (type->vector_elements <= 2 || is_vs_input)
             return 1;
          else
             return 2;
@@ -1096,11 +1130,11 @@ type_size(const struct glsl_type *type)
       break;
    case GLSL_TYPE_ARRAY:
       assert(type->length > 0);
-      return type_size(type->fields.array) * type->length;
+      return attrib_type_size(type->fields.array, is_vs_input) * type->length;
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < type->length; i++) {
-         size += type_size(type->fields.structure[i].type);
+         size += attrib_type_size(type->fields.structure[i].type, is_vs_input);
       }
       return size;
    case GLSL_TYPE_SAMPLER:
@@ -1120,6 +1154,11 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+static int
+type_size(const struct glsl_type *type)
+{
+  return attrib_type_size(type, false);
+}
 
 /**
  * If the given GLSL type is an array or matrix or a structure containing
@@ -1413,7 +1452,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
    if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
 
    if (*num_reladdr != 1) {
-      st_src_reg temp = get_temp(glsl_type::vec4_type);
+      st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
 
       emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;
@@ -1776,89 +1815,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
 
-   case ir_unop_any: {
-      assert(ir->operands[0]->type->is_vector());
-
-      if (native_integers) {
-         int dst_swizzle = 0, op0_swizzle, i;
-         st_src_reg accum = op[0];
-
-         op0_swizzle = op[0].swizzle;
-         accum.swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0));
-         for (i = 0; i < 4; i++) {
-            if (result_dst.writemask & (1 << i)) {
-               dst_swizzle = MAKE_SWIZZLE4(i, i, i, i);
-               break;
-            }
-         }
-         assert(i != 4);
-         assert(ir->operands[0]->type->is_boolean());
-
-         /* OR all the components together, since they should be either 0 or ~0
-          */
-         switch (ir->operands[0]->type->vector_elements) {
-         case 4:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            accum = st_src_reg(result_dst);
-            accum.swizzle = dst_swizzle;
-            /* fallthrough */
-         case 3:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            accum = st_src_reg(result_dst);
-            accum.swizzle = dst_swizzle;
-            /* fallthrough */
-         case 2:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            break;
-         default:
-            assert(!"Unexpected vector size");
-            break;
-         }
-      } else {
-         /* After the dot-product, the value will be an integer on the
-          * range [0,4].  Zero stays zero, and positive values become 1.0.
-          */
-         glsl_to_tgsi_instruction *const dp =
-            emit_dp(ir, result_dst, op[0], op[0],
-                    ir->operands[0]->type->vector_elements);
-         if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
-             result_dst.type == GLSL_TYPE_FLOAT) {
-            /* The clamping to [0,1] can be done for free in the fragment
-             * shader with a saturate.
-             */
-            dp->saturate = true;
-         } else if (result_dst.type == GLSL_TYPE_FLOAT) {
-            /* Negating the result of the dot-product gives values on the range
-             * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
-             * is achieved using SLT.
-             */
-            st_src_reg slt_src = result_src;
-            slt_src.negate = ~slt_src.negate;
-            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
-         }
-         else {
-            /* Use SNE 0 if integers are being used as boolean values. */
-            emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
-         }
-      }
-      break;
-   }
-
    case ir_binop_logic_xor:
       if (native_integers)
          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
@@ -2207,23 +2163,26 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
 
+   case ir_unop_pack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_PK2H, result_dst, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      emit_asm(ir, TGSI_OPCODE_UP2H, result_dst, op[0]);
+      break;
+
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_half_2x16:
    case ir_unop_pack_snorm_4x8:
    case ir_unop_pack_unorm_4x8:
 
    case ir_unop_unpack_snorm_2x16:
    case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_half_2x16:
    case ir_unop_unpack_half_2x16_split_x:
    case ir_unop_unpack_half_2x16_split_y:
    case ir_unop_unpack_snorm_4x8:
    case ir_unop_unpack_unorm_4x8:
 
    case ir_binop_pack_half_2x16_split:
-   case ir_binop_bfm:
-   case ir_triop_bfi:
    case ir_quadop_vector:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert:
@@ -2346,10 +2305,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_input_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_input_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,
@@ -2372,10 +2334,13 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
             decl->mesa_index = var->data.location;
             decl->array_id = num_output_arrays + 1;
-            if (is_2d)
+            if (is_2d) {
                decl->array_size = type_size(var->type->fields.array);
-            else
+               decl->array_type = var->type->fields.array->without_array()->base_type;
+            } else {
                decl->array_size = type_size(var->type);
+               decl->array_type = var->type->without_array()->base_type;
+            }
             num_output_arrays++;
 
             entry = new(mem_ctx) variable_storage(var,
@@ -2414,6 +2379,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
    this->result.array_id = entry->array_id;
+   if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
+      this->result.is_double_vertex_input = true;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
@@ -2421,6 +2388,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 static void
 shrink_array_declarations(struct array_decl *arrays, unsigned count,
                           GLbitfield64 usage_mask,
+                          GLbitfield64 double_usage_mask,
                           GLbitfield patch_usage_mask)
 {
    unsigned i, j;
@@ -2441,6 +2409,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->mesa_index++;
@@ -2458,6 +2428,8 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
          else {
             if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
                break;
+            if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
+               break;
          }
 
          decl->array_size--;
@@ -2498,6 +2470,10 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       element_size = 1;
 
    if (index) {
+
+      if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
+         src.file == PROGRAM_INPUT)
+        element_size = attrib_type_size(ir->type, true);
       if (is_2D) {
          src.index2D = index->value.i[0];
          src.has_index2 = true;
@@ -2749,7 +2725,7 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    if (type->is_matrix()) {
       const struct glsl_type *vec_type;
 
-      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+      vec_type = glsl_type::get_instance(type->is_double() ? GLSL_TYPE_DOUBLE : GLSL_TYPE_FLOAT,
                                          type->vector_elements, 1);
 
       for (int i = 0; i < type->matrix_columns; i++) {
@@ -2779,6 +2755,11 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
    }
    l->index++;
    r->index++;
+   if (type->is_dual_slot_double()) {
+      l->index++;
+      if (r->is_double_vertex_input == false)
+        r->index++;
+   }
 }
 
 void
@@ -2798,7 +2779,26 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
     */
    if (ir->write_mask == 0) {
       assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
-      l.writemask = WRITEMASK_XYZW;
+
+      if (ir->lhs->type->is_array() || ir->lhs->type->without_array()->is_matrix()) {
+         if (ir->lhs->type->without_array()->is_double()) {
+            switch (ir->lhs->type->without_array()->vector_elements) {
+            case 1:
+               l.writemask = WRITEMASK_X;
+               break;
+            case 2:
+               l.writemask = WRITEMASK_XY;
+               break;
+            case 3:
+               l.writemask = WRITEMASK_XYZ;
+               break;
+            case 4:
+               l.writemask = WRITEMASK_XYZW;
+               break;
+            }
+         } else
+            l.writemask = WRITEMASK_XYZW;
+      }
    } else if (ir->lhs->type->is_scalar() &&
               !ir->lhs->type->is_double() &&
               ir->lhs->variable_referenced()->data.mode == ir_var_shader_out) {
@@ -2932,20 +2932,57 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
       st_dst_reg mat_column = st_dst_reg(mat);
 
       for (i = 0; i < ir->type->matrix_columns; i++) {
-         assert(ir->type->base_type == GLSL_TYPE_FLOAT);
-         values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
-
-         src = st_src_reg(file, -1, ir->type->base_type);
-         src.index = add_constant(file,
-                                  values,
-                                  ir->type->vector_elements,
-                                  GL_FLOAT,
-                                  &src.swizzle);
-         emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+         switch (ir->type->base_type) {
+         case GLSL_TYPE_FLOAT:
+            values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
 
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_FLOAT,
+                                     &src.swizzle);
+            emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            break;
+         case GLSL_TYPE_DOUBLE:
+            values = (gl_constant_value *) &ir->value.d[i * ir->type->vector_elements];
+            src = st_src_reg(file, -1, ir->type->base_type);
+            src.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     GL_DOUBLE,
+                                     &src.swizzle);
+            if (ir->type->vector_elements >= 2) {
+               mat_column.writemask = WRITEMASK_XY;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            } else {
+               mat_column.writemask = WRITEMASK_X;
+               src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+               emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+            }
+            src.index++;
+            if (ir->type->vector_elements > 2) {
+               if (ir->type->vector_elements == 4) {
+                  mat_column.writemask = WRITEMASK_ZW;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+               } else {
+                  mat_column.writemask = WRITEMASK_Z;
+                  src.swizzle = MAKE_SWIZZLE4(SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y);
+                  emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
+                  mat_column.writemask = WRITEMASK_XYZW;
+                  src.swizzle = SWIZZLE_XYZW;
+               }
+               mat_column.index++;
+            }
+            break;
+         default:
+            unreachable("Illegal matrix constant type.\n");
+            break;
+         }
          mat_column.index++;
       }
-
       this->result = mat;
       return;
    }
@@ -4411,6 +4448,8 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_INSTANCEID,
    TGSI_SEMANTIC_VERTEXID_NOBASE,
    TGSI_SEMANTIC_BASEVERTEX,
+   TGSI_SEMANTIC_BASEINSTANCE,
+   TGSI_SEMANTIC_DRAWID,
 
    /* Geometry shader
     */
@@ -4418,6 +4457,7 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
 
    /* Fragment shader
     */
+   TGSI_SEMANTIC_POSITION,
    TGSI_SEMANTIC_FACE,
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,
@@ -4626,7 +4666,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
       if (!reg->array_id) {
          assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
          assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
-         return t->inputs[t->inputMapping[index]];
+         return t->inputs[t->inputMapping[index] + double_reg2];
       }
       else {
          struct array_decl *decl = &t->input_arrays[reg->array_id-1];
@@ -4635,7 +4675,7 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 
          assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
          assert(t->inputs[slot].ArrayID == reg->array_id);
-         return ureg_src_array_offset(t->inputs[slot], index - mesa_index);
+         return ureg_src_array_offset(t->inputs[slot], index + double_reg2 - mesa_index);
       }
 
    case PROGRAM_ADDRESS:
@@ -4864,10 +4904,11 @@ compile_tgsi_instruction(struct st_translate *t,
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      int wpos_transform_const,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     int wpos_transform_const,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -4879,7 +4920,11 @@ emit_wpos_adjustment( struct st_translate *t,
     */
    struct ureg_src wpostrans = ureg_DECL_constant(ureg, wpos_transform_const);
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -4930,7 +4975,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -5039,7 +5084,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, wpos_transform_const, invert, adjX, adjY);
+   emit_wpos_adjustment(st->ctx, t, wpos_transform_const, invert, adjX, adjY);
 }
 
 /**
@@ -5329,11 +5374,13 @@ st_translate_program(
     */
    {
       GLbitfield sysInputs = proginfo->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -5354,7 +5401,12 @@ st_translate_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, proginfo, ureg,
+                         program->wpos_transform_const);
+
             sysInputs &= ~(1 << i);
          }
       }
@@ -5634,14 +5686,15 @@ get_mesa_program(struct gl_context *ctx,
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
    shrink_array_declarations(v->input_arrays, v->num_input_arrays,
-                             prog->InputsRead, prog->PatchInputsRead);
+                             prog->InputsRead, prog->DoubleInputsRead, prog->PatchInputsRead);
    shrink_array_declarations(v->output_arrays, v->num_output_arrays,
-                             prog->OutputsWritten, prog->PatchOutputsWritten);
+                             prog->OutputsWritten, 0ULL, prog->PatchOutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
    if (shader->Type == GL_FRAGMENT_SHADER &&
-       prog->InputsRead & VARYING_BIT_POS){
+       (prog->InputsRead & VARYING_BIT_POS ||
+        prog->SystemValuesRead & (1 << SYSTEM_VALUE_FRAG_COORD))) {
       static const gl_state_index wposTransformState[STATE_LENGTH] = {
          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
       };
@@ -5652,6 +5705,12 @@ get_mesa_program(struct gl_context *ctx,
 
    _mesa_reference_program(ctx, &shader->Program, prog);
 
+   /* Avoid reallocation of the program parameter list, because the uniform
+    * storage is only associated with the original parameter list.
+    * This should be enough for Bitmap and DrawPixels constants.
+    */
+   _mesa_reserve_parameter_storage(prog->Parameters, 8);
+
    /* This has to be done last.  Any operation the can cause
     * prog->ParameterValues to get reallocated (e.g., anything that adds a
     * program constant) has to happen before creating this linkage.
@@ -5811,13 +5870,14 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_PACK_SNORM_4x8 |
                                LOWER_UNPACK_SNORM_4x8 |
                                LOWER_UNPACK_UNORM_4x8 |
-                               LOWER_PACK_UNORM_4x8 |
-                               LOWER_PACK_HALF_2x16 |
-                               LOWER_UNPACK_HALF_2x16;
+                               LOWER_PACK_UNORM_4x8;
 
          if (ctx->Extensions.ARB_gpu_shader5)
             lower_inst |= LOWER_PACK_USE_BFI |
                           LOWER_PACK_USE_BFE;
+         if (!ctx->st->has_half_float_packing)
+            lower_inst |= LOWER_PACK_HALF_2x16 |
+                          LOWER_UNPACK_HALF_2x16;
 
          lower_packing_builtins(ir, lower_inst);
       }
index d0d261f..385e26b 100644 (file)
@@ -39,6 +39,7 @@
 #include "st_texture.h"
 
 #include "st_context.h"
+#include "st_debug.h"
 #include "st_extensions.h"
 #include "st_format.h"
 #include "st_cb_fbo.h"
@@ -623,58 +624,6 @@ st_context_destroy(struct st_context_iface *stctxi)
    st_destroy_context(st);
 }
 
-static void
-st_debug_message(void *data,
-                 unsigned *id,
-                 enum pipe_debug_type ptype,
-                 const char *fmt,
-                 va_list args)
-{
-   struct st_context *st = data;
-   enum mesa_debug_source source;
-   enum mesa_debug_type type;
-   enum mesa_debug_severity severity;
-
-   switch (ptype) {
-   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_ERROR:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_ERROR;
-      severity = MESA_DEBUG_SEVERITY_MEDIUM;
-      break;
-   case PIPE_DEBUG_TYPE_SHADER_INFO:
-      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_PERF_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_INFO:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_FALLBACK:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_PERFORMANCE;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   case PIPE_DEBUG_TYPE_CONFORMANCE:
-      source = MESA_DEBUG_SOURCE_API;
-      type = MESA_DEBUG_TYPE_OTHER;
-      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
-      break;
-   }
-   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
-}
-
 static struct st_context_iface *
 st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
                       const struct st_context_attribs *attribs,
@@ -723,17 +672,15 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
       return NULL;
    }
 
-   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG){
+   if (attribs->flags & ST_CONTEXT_FLAG_DEBUG) {
       if (!_mesa_set_debug_state_int(st->ctx, GL_DEBUG_OUTPUT, GL_TRUE)) {
          *error = ST_CONTEXT_ERROR_NO_MEMORY;
          return NULL;
       }
+
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
 
-      if (pipe->set_debug_callback) {
-         struct pipe_debug_callback cb = { st_debug_message, st };
-         pipe->set_debug_callback(pipe, &cb);
-      }
+      st_enable_debug_output(st, TRUE);
    }
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
index 4b9dc99..be47823 100644 (file)
@@ -475,24 +475,6 @@ static void emit_swz( struct st_translate *t,
 }
 
 
-/**
- * Negate the value of DDY to match GL semantics where (0,0) is the
- * lower-left corner of the window.
- * Note that the GL_ARB_fragment_coord_conventions extension will
- * effect this someday.
- */
-static void emit_ddy( struct st_translate *t,
-                      struct ureg_dst dst,
-                      const struct prog_src_register *SrcReg )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_src src = translate_src( t, SrcReg );
-   src = ureg_negate( src );
-   ureg_DDY( ureg, dst, src );
-}
-
-
-
 static unsigned
 translate_opcode( unsigned op )
 {
@@ -714,10 +696,6 @@ compile_instruction(
        */
       ureg_MOV( ureg, dst[0], ureg_imm1f(ureg, 0.5) );
       break;
-                
-   case OPCODE_DDY:
-      emit_ddy( t, dst[0], &inst->SrcReg[0] );
-      break;
 
    case OPCODE_RSQ:
       ureg_RSQ( ureg, dst[0], ureg_abs(src[0]) );
@@ -739,10 +717,11 @@ compile_instruction(
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_adjustment( struct st_translate *t,
-                      const struct gl_program *program,
-                      boolean invert,
-                      GLfloat adjX, GLfloat adjY[2])
+emit_wpos_adjustment(struct gl_context *ctx,
+                     struct st_translate *t,
+                     const struct gl_program *program,
+                     boolean invert,
+                     GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -762,7 +741,11 @@ emit_wpos_adjustment( struct st_translate *t,
 
    struct ureg_src wpostrans = ureg_DECL_constant( ureg, wposTransConst );
    struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src *wpos =
+      ctx->Const.GLSLFragCoordIsSysVal ?
+         &t->systemValues[SYSTEM_VALUE_FRAG_COORD] :
+         &t->inputs[t->inputMapping[VARYING_SLOT_POS]];
+   struct ureg_src wpos_input = *wpos;
 
    /* First, apply the coordinate shift: */
    if (adjX || adjY[0] || adjY[1]) {
@@ -813,7 +796,7 @@ emit_wpos_adjustment( struct st_translate *t,
 
    /* Use wpos_temp as position input from here on:
     */
-   t->inputs[t->inputMapping[VARYING_SLOT_POS]] = ureg_src(wpos_temp);
+   *wpos = ureg_src(wpos_temp);
 }
 
 
@@ -921,32 +904,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_adjustment(t, program, invert, adjX, adjY);
-}
-
-
-/**
- * OpenGL's fragment gl_FrontFace input is 1 for front-facing, 0 for back.
- * TGSI uses +1 for front, -1 for back.
- * This function converts the TGSI value to the GL value.  Simply clamping/
- * saturating the value to [0,1] does the job.
- */
-static void
-emit_face_var( struct st_translate *t,
-               const struct gl_program *program )
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst face_temp = ureg_DECL_temporary( ureg );
-   struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];
-
-   /* MOV_SAT face_temp, input[face]
-    */
-   face_temp = ureg_saturate( face_temp );
-   ureg_MOV( ureg, face_temp, face_input );
-
-   /* Use face_temp as face input from here on:
-    */
-   t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
+   emit_wpos_adjustment(st->ctx, t, program, invert, adjX, adjY);
 }
 
 
@@ -1020,10 +978,6 @@ st_translate_mesa_program(
          emit_wpos(st_context(ctx), t, program, ureg);
       }
 
-      if (program->InputsRead & VARYING_BIT_FACE) {
-         emit_face_var( t, program );
-      }
-
       /*
        * Declare output attributes.
        */
@@ -1100,11 +1054,13 @@ st_translate_mesa_program(
     */
    {
       GLbitfield sysInputs = program->SystemValuesRead;
-      unsigned numSys = 0;
+
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
             unsigned semName = _mesa_sysval_to_semantic[i];
-            t->systemValues[i] = ureg_DECL_system_value(ureg, numSys, semName, 0);
+
+            t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
+
             if (semName == TGSI_SEMANTIC_INSTANCEID ||
                 semName == TGSI_SEMANTIC_VERTEXID) {
                /* From Gallium perspective, these system values are always
@@ -1125,7 +1081,11 @@ st_translate_mesa_program(
                   t->systemValues[i] = ureg_scalar(ureg_src(temp), 0);
                }
             }
-            numSys++;
+
+            if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                semName == TGSI_SEMANTIC_POSITION)
+               emit_wpos(st_context(ctx), t, program, ureg);
+
             sysInputs &= ~(1 << i);
          }
       }
index 75ccaf2..b395454 100644 (file)
@@ -112,8 +112,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv)
 {
    if (fpv->driver_shader) 
       cso_delete_fragment_shader(st->cso_context, fpv->driver_shader);
-   if (fpv->parameters)
-      _mesa_free_parameter_list(fpv->parameters);
    free(fpv);
 }
 
@@ -583,8 +581,11 @@ st_translate_fragment_program(struct st_context *st,
 
    memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
-   if (!stfp->glsl_to_tgsi)
+   if (!stfp->glsl_to_tgsi) {
       _mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT);
+      if (st->ctx->Const.GLSLFragCoordIsSysVal)
+         _mesa_program_fragment_position_to_sysval(&stfp->Base.Base);
+   }
 
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
@@ -914,8 +915,6 @@ st_create_fp_variant(struct st_context *st,
          if (tgsi.tokens != stfp->tgsi.tokens)
             tgsi_free_tokens(tgsi.tokens);
          tgsi.tokens = tokens;
-         variant->parameters =
-            _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
       } else
          fprintf(stderr, "mesa: cannot create a shader for glBitmap\n");
    }
@@ -924,6 +923,7 @@ st_create_fp_variant(struct st_context *st,
    if (key->drawpixels) {
       const struct tgsi_token *tokens;
       unsigned scale_const = 0, bias_const = 0, texcoord_const = 0;
+      struct gl_program_parameter_list *params = stfp->Base.Base.Parameters;
 
       /* Find the first unused slot. */
       variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
@@ -935,27 +935,21 @@ st_create_fp_variant(struct st_context *st,
          variant->pixelmap_sampler = ffs(~samplers_used) - 1;
       }
 
-      variant->parameters =
-         _mesa_clone_parameter_list(stfp->Base.Base.Parameters);
-
       if (key->scaleAndBias) {
          static const gl_state_index scale_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_SCALE };
          static const gl_state_index bias_state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_PT_BIAS };
 
-         scale_const = _mesa_add_state_reference(variant->parameters,
-                                                 scale_state);
-         bias_const = _mesa_add_state_reference(variant->parameters,
-                                                bias_state);
+         scale_const = _mesa_add_state_reference(params, scale_state);
+         bias_const = _mesa_add_state_reference(params, bias_state);
       }
 
       {
          static const gl_state_index state[STATE_LENGTH] =
             { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 };
 
-         texcoord_const = _mesa_add_state_reference(variant->parameters,
-                                                    state);
+         texcoord_const = _mesa_add_state_reference(params, state);
       }
 
       tokens = st_get_drawpix_shader(tgsi.tokens,
index d9b53ac..a745315 100644 (file)
@@ -80,7 +80,6 @@ struct st_fp_variant
    void *driver_shader;
 
    /** For glBitmap variants */
-   struct gl_program_parameter_list *parameters;
    uint bitmap_sampler;
 
    /** For glDrawPixels variants */
@@ -406,12 +405,12 @@ st_get_gp_variant(struct st_context *st,
 
 extern struct st_tcp_variant *
 st_get_tcp_variant(struct st_context *st,
-                   struct st_tessctrl_program *stgp,
+                   struct st_tessctrl_program *sttcp,
                    const struct st_tcp_variant_key *key);
 
 extern struct st_tep_variant *
 st_get_tep_variant(struct st_context *st,
-                   struct st_tesseval_program *stgp,
+                   struct st_tesseval_program *sttep,
                    const struct st_tep_variant_key *key);
 
 extern void
@@ -428,11 +427,11 @@ st_release_gp_variants(struct st_context *st,
 
 extern void
 st_release_tcp_variants(struct st_context *st,
-                        struct st_tessctrl_program *stgp);
+                        struct st_tessctrl_program *sttcp);
 
 extern void
 st_release_tep_variants(struct st_context *st,
-                        struct st_tesseval_program *stgp);
+                        struct st_tesseval_program *sttep);
 
 extern void
 st_destroy_program_variants(struct st_context *st);
index 2974dee..414a414 100644 (file)
@@ -26,6 +26,8 @@
 #include "swrast/s_atifragshader.h"
 #include "swrast/s_context.h"
 
+#define ATI_FS_INPUT_PRIMARY 0
+#define ATI_FS_INPUT_SECONDARY 1
 
 /**
  * State for executing ATI fragment shader.
index 00e843c..0b8b6a9 100644 (file)
@@ -58,6 +58,7 @@ struct _mesa_prim {
    GLint basevertex;
    GLuint num_instances;
    GLuint base_instance;
+   GLuint draw_id;
 
    GLsizeiptr indirect_offset;
 };
@@ -77,7 +78,7 @@ struct _mesa_index_buffer {
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx );
 void _vbo_DestroyContext( struct gl_context *ctx );
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state );
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state );
 
 
 void
@@ -109,6 +110,18 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx,
                               struct gl_buffer_object *indirect);
 
 
+typedef void (*vbo_indirect_draw_func)(
+   struct gl_context *ctx,
+   GLuint mode,
+   struct gl_buffer_object *indirect_data,
+   GLsizeiptr indirect_offset,
+   unsigned draw_count,
+   unsigned stride,
+   struct gl_buffer_object *indirect_params,
+   GLsizeiptr indirect_params_offset,
+   const struct _mesa_index_buffer *ib);
+
+
 
 
 /* Utility function to cope with various constraints on tnl modules or
@@ -178,6 +191,9 @@ void vbo_always_unmap_buffers(struct gl_context *ctx);
 
 void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func);
 
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func);
+
 void vbo_check_buffers_are_unmapped(struct gl_context *ctx);
 
 void vbo_bind_arrays(struct gl_context *ctx);
index 5e1a760..9f807a1 100644 (file)
@@ -135,6 +135,48 @@ static void init_mat_currval(struct gl_context *ctx)
    }
 }
 
+static void
+vbo_draw_indirect_prims(struct gl_context *ctx,
+                        GLuint mode,
+                        struct gl_buffer_object *indirect_data,
+                        GLsizeiptr indirect_offset,
+                        unsigned draw_count,
+                        unsigned stride,
+                        struct gl_buffer_object *indirect_params,
+                        GLsizeiptr indirect_params_offset,
+                        const struct _mesa_index_buffer *ib)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct _mesa_prim *prim;
+   GLsizei i;
+
+   prim = calloc(draw_count, sizeof(*prim));
+   if (prim == NULL) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDraw%sIndirect%s",
+                  (draw_count > 1) ? "Multi" : "",
+                  ib ? "Elements" : "Arrays",
+                  indirect_params ? "CountARB" : "");
+      return;
+   }
+
+   prim[0].begin = 1;
+   prim[draw_count - 1].end = 1;
+   for (i = 0; i < draw_count; ++i, indirect_offset += stride) {
+      prim[i].mode = mode;
+      prim[i].indexed = !!ib;
+      prim[i].indirect_offset = indirect_offset;
+      prim[i].is_indirect = 1;
+      prim[i].draw_id = i;
+   }
+
+   vbo->draw_prims(ctx, prim, draw_count,
+                   ib, GL_TRUE, 0, ~0,
+                   NULL, 0,
+                   ctx->DrawIndirectBuffer);
+
+   free(prim);
+}
+
 
 GLboolean _vbo_CreateContext( struct gl_context *ctx )
 {
@@ -152,6 +194,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
    init_legacy_currval( ctx );
    init_generic_currval( ctx );
    init_mat_currval( ctx );
+   vbo_set_indirect_draw_func(ctx, vbo_draw_indirect_prims);
 
    /* Build mappings from VERT_ATTRIB -> VBO_ATTRIB depending on type
     * of vertex program active.
@@ -186,7 +229,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx )
 }
 
 
-void _vbo_InvalidateState( struct gl_context *ctx, GLuint new_state )
+void _vbo_InvalidateState( struct gl_context *ctx, GLbitfield new_state )
 {
    vbo_exec_invalidate_state(ctx, new_state);
 }
@@ -223,3 +266,10 @@ void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func)
    vbo->draw_prims = func;
 }
 
+
+void vbo_set_indirect_draw_func(struct gl_context *ctx,
+                                vbo_indirect_draw_func func)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   vbo->draw_indirect_prims = func;
+}
index 6293a8b..11f9b17 100644 (file)
@@ -76,6 +76,12 @@ struct vbo_context {
     * is responsible for initiating any fallback actions required:
     */
    vbo_draw_func draw_prims;
+
+   /* Optional callback for indirect draws. This allows multidraws to not be
+    * broken up, as well as for the actual count to be passed in as a separate
+    * indirect parameter.
+    */
+   vbo_indirect_draw_func draw_indirect_prims;
 };
 
 
index a301c6c..4db4f40 100644 (file)
@@ -73,7 +73,7 @@ void vbo_exec_destroy( struct gl_context *ctx )
  * invoked according to the state flags.  That will have to wait for a
  * mesa rework:
  */ 
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state )
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state )
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
index a80b2c9..27bff4a 100644 (file)
@@ -146,7 +146,7 @@ struct vbo_exec_context
  */
 void vbo_exec_init( struct gl_context *ctx );
 void vbo_exec_destroy( struct gl_context *ctx );
-void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state );
+void vbo_exec_invalidate_state( struct gl_context *ctx, GLbitfield new_state );
 
 
 /* Internal functions:
index e27fdd9..02139ef 100644 (file)
@@ -1341,6 +1341,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
         prim[i].indexed = 1;
          prim[i].num_instances = 1;
          prim[i].base_instance = 0;
+         prim[i].draw_id = i;
          prim[i].is_indirect = 0;
         if (basevertex != NULL)
            prim[i].basevertex = basevertex[i];
@@ -1371,6 +1372,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
         prim[0].indexed = 1;
          prim[0].num_instances = 1;
          prim[0].base_instance = 0;
+         prim[0].draw_id = i;
          prim[0].is_indirect = 0;
         if (basevertex != NULL)
            prim[0].basevertex = basevertex[i];
@@ -1544,27 +1546,14 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].is_indirect = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-
-   /* NOTE: We do NOT want to handle primitive restart here, nor perform any
-    * other checks that require knowledge of the values in the command buffer.
-    * That would defeat the whole purpose of this function.
-    */
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 16 /* stride */,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1578,35 +1567,18 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx,
 {
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawArraysIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   NULL, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1620,7 +1592,6 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim prim[1];
 
    vbo_bind_arrays(ctx);
 
@@ -1629,19 +1600,12 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   memset(prim, 0, sizeof(prim));
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].mode = mode;
-   prim[0].indexed = 1;
-   prim[0].indirect_offset = (GLsizeiptr)indirect;
-   prim[0].is_indirect = 1;
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, 1,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, (GLsizeiptr)indirect,
+                            1 /* draw_count */, 20 /* stride */,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1656,17 +1620,10 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    struct _mesa_index_buffer ib;
-   struct _mesa_prim *prim;
-   GLsizei i;
    GLsizeiptr offset = (GLsizeiptr)indirect;
 
    if (primcount == 0)
       return;
-   prim = calloc(primcount, sizeof(*prim));
-   if (prim == NULL) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawElementsIndirect");
-      return;
-   }
 
    vbo_bind_arrays(ctx);
 
@@ -1677,22 +1634,12 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    ib.obj = ctx->Array.VAO->IndexBufferObj;
    ib.ptr = NULL;
 
-   prim[0].begin = 1;
-   prim[primcount - 1].end = 1;
-   for (i = 0; i < primcount; ++i, offset += stride) {
-      prim[i].mode = mode;
-      prim[i].indexed = 1;
-      prim[i].indirect_offset = offset;
-      prim[i].is_indirect = 1;
-   }
-
    check_buffers_are_unmapped(exec->array.inputs);
-   vbo->draw_prims(ctx, prim, primcount,
-                   &ib, GL_TRUE, 0, ~0,
-                   NULL, 0,
-                   ctx->DrawIndirectBuffer);
-
-   free(prim);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            primcount, stride,
+                            NULL, 0,
+                            &ib);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
       _mesa_flush(ctx);
@@ -1785,6 +1732,128 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
                                            primcount, stride);
 }
 
+static void
+vbo_validated_multidrawarraysindirectcount(struct gl_context *ctx,
+                                           GLenum mode,
+                                           GLintptr indirect,
+                                           GLintptr drawcount,
+                                           GLsizei maxdrawcount,
+                                           GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   GLsizeiptr offset = indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            NULL);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
+static void
+vbo_validated_multidrawelementsindirectcount(struct gl_context *ctx,
+                                             GLenum mode, GLenum type,
+                                             GLintptr indirect,
+                                             GLintptr drawcount,
+                                             GLsizei maxdrawcount,
+                                             GLsizei stride)
+{
+   struct vbo_context *vbo = vbo_context(ctx);
+   struct vbo_exec_context *exec = &vbo->exec;
+   struct _mesa_index_buffer ib;
+   GLsizeiptr offset = (GLsizeiptr)indirect;
+
+   if (maxdrawcount == 0)
+      return;
+
+   vbo_bind_arrays(ctx);
+
+   /* NOTE: IndexBufferObj is guaranteed to be a VBO. */
+
+   ib.count = 0; /* unknown */
+   ib.type = type;
+   ib.obj = ctx->Array.VAO->IndexBufferObj;
+   ib.ptr = NULL;
+
+   check_buffers_are_unmapped(exec->array.inputs);
+   vbo->draw_indirect_prims(ctx, mode,
+                            ctx->DrawIndirectBuffer, offset,
+                            maxdrawcount, stride,
+                            ctx->ParameterBuffer, drawcount,
+                            &ib);
+
+   if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
+      _mesa_flush(ctx);
+}
+
+static void GLAPIENTRY
+vbo_exec_MultiDrawArraysIndirectCount(GLenum mode,
+                                      GLintptr indirect,
+                                      GLintptr drawcount,
+                                      GLsizei maxdrawcount, GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawArraysIndirectCountARB"
+                  "(%s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode), indirect,
+                  drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 4 * sizeof(GLuint); /* sizeof(DrawArraysIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawArraysIndirectCount(ctx, mode,
+                                                    indirect, drawcount,
+                                                    maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawarraysindirectcount(ctx, mode,
+                                              indirect, drawcount,
+                                              maxdrawcount, stride);
+}
+
+static void GLAPIENTRY
+vbo_exec_MultiDrawElementsIndirectCount(GLenum mode, GLenum type,
+                                        GLintptr indirect,
+                                        GLintptr drawcount,
+                                        GLsizei maxdrawcount, GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_DRAW)
+      _mesa_debug(ctx, "glMultiDrawElementsIndirectCountARB"
+                  "(%s, %s, %lx, %lx, %i, %i)\n",
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect,
+                  drawcount, maxdrawcount, stride);
+
+   /* If <stride> is zero, the array elements are treated as tightly packed. */
+   if (stride == 0)
+      stride = 5 * sizeof(GLuint); /* sizeof(DrawElementsIndirectCommand) */
+
+   if (!_mesa_validate_MultiDrawElementsIndirectCount(ctx, mode, type,
+                                                      indirect, drawcount,
+                                                      maxdrawcount, stride))
+      return;
+
+   vbo_validated_multidrawelementsindirectcount(ctx, mode, type,
+                                                indirect, drawcount,
+                                                maxdrawcount, stride);
+}
+
+
 /**
  * Initialize the dispatch table with the VBO functions for drawing.
  */
@@ -1832,6 +1901,8 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
    if (ctx->API == API_OPENGL_CORE) {
       SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
       SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
+      SET_MultiDrawArraysIndirectCountARB(exec, vbo_exec_MultiDrawArraysIndirectCount);
+      SET_MultiDrawElementsIndirectCountARB(exec, vbo_exec_MultiDrawElementsIndirectCount);
    }
 
    if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx)) {
index c452819..2404ce7 100644 (file)
@@ -98,7 +98,7 @@ __bitset_ffs(const BITSET_WORD *x, int n)
 
 static inline unsigned
 __bitset_next_set(unsigned i, BITSET_WORD *tmp,
-                  BITSET_WORD *set, unsigned size)
+                  const BITSET_WORD *set, unsigned size)
 {
    unsigned bit, word;
 
index 066f9b8..f0dec5d 100644 (file)
@@ -116,6 +116,28 @@ static inline unsigned list_length(struct list_head *list)
    return length;
 }
 
+static inline void list_splice(struct list_head *src, struct list_head *dst)
+{
+   if (list_empty(src))
+      return;
+
+   src->next->prev = dst;
+   src->prev->next = dst->next;
+   dst->next->prev = src->prev;
+   dst->next = src->next;
+}
+
+static inline void list_splicetail(struct list_head *src, struct list_head *dst)
+{
+   if (list_empty(src))
+      return;
+
+   src->prev->next = dst;
+   src->next->prev = dst->prev;
+   dst->prev->next = src->next;
+   dst->prev = src->prev;
+}
+
 static inline void list_validate(struct list_head *list)
 {
    struct list_head *node;
index bb4cf96..6d4032b 100644 (file)
@@ -293,6 +293,7 @@ ralloc_adopt(const void *new_ctx, void *old_ctx)
 
    /* Connect the two lists together; parent them to new_ctx; make old_ctx empty. */
    child->next = new_info->child;
+   child->parent = new_info;
    new_info->child = old_info->child;
    old_info->child = NULL;
 }
diff --git a/src/vulkan/.gitignore b/src/vulkan/.gitignore
new file mode 100644 (file)
index 0000000..8f9477c
--- /dev/null
@@ -0,0 +1,7 @@
+# Generated source files
+/*_spirv_autogen.h
+/anv_entrypoints.c
+/anv_entrypoints.h
+/wayland-drm-protocol.c
+/wayland-drm-client-protocol.h
+/anv_icd.json
diff --git a/src/vulkan/Makefile.am b/src/vulkan/Makefile.am
new file mode 100644 (file)
index 0000000..efb781e
--- /dev/null
@@ -0,0 +1,175 @@
+# Copyright © 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+SUBDIRS = . tests
+
+vulkan_includedir = $(includedir)/vulkan
+
+vulkan_include_HEADERS =                               \
+       $(top_srcdir)/include/vulkan/vk_platform.h      \
+       $(top_srcdir)/include/vulkan/vulkan.h           \
+       $(top_srcdir)/include/vulkan/vulkan_intel.h
+
+# Used when generating entrypoints to filter out unwanted extensions
+VULKAN_ENTRYPOINT_CPPFLAGS = \
+   -I$(top_srcdir)/include/vulkan \
+   -DVK_USE_PLATFORM_XCB_KHR \
+   -DVK_USE_PLATFORM_WAYLAND_KHR
+
+lib_LTLIBRARIES = libvulkan.la
+
+check_LTLIBRARIES = libvulkan-test.la
+
+PER_GEN_LIBS = \
+   libanv-gen7.la \
+   libanv-gen75.la \
+   libanv-gen8.la \
+   libanv-gen9.la
+
+noinst_LTLIBRARIES = $(PER_GEN_LIBS)
+
+# The gallium includes are for the util/u_math.h include from main/macros.h
+
+AM_CPPFLAGS = \
+       $(INTEL_CFLAGS) \
+       $(VALGRIND_CFLAGS) \
+       $(DEFINES) \
+       -I$(top_srcdir)/include \
+       -I$(top_srcdir)/src \
+       -I$(top_srcdir)/src/glsl/nir \
+       -I$(top_srcdir)/src/mapi \
+       -I$(top_srcdir)/src/mesa \
+       -I$(top_srcdir)/src/mesa/drivers/dri/common \
+       -I$(top_srcdir)/src/mesa/drivers/dri/i965 \
+       -I$(top_srcdir)/src/gallium/auxiliary \
+       -I$(top_srcdir)/src/gallium/include \
+       -I$(top_srcdir)/src/isl/ \
+       -I$(top_builddir)/src \
+       -I$(top_builddir)/src/glsl/nir \
+       -I$(top_builddir)/src/vulkan
+
+libvulkan_la_CFLAGS = $(CFLAGS) -Wno-override-init
+
+VULKAN_SOURCES =                                        \
+       anv_allocator.c                                 \
+       anv_cmd_buffer.c                                \
+       anv_batch_chain.c                               \
+       anv_descriptor_set.c                            \
+       anv_device.c                                    \
+        anv_dump.c                                      \
+       anv_entrypoints.c                               \
+       anv_entrypoints.h                               \
+       anv_formats.c                                   \
+       anv_image.c                                     \
+       anv_intel.c                                     \
+       anv_meta.c                                      \
+       anv_meta_clear.c                                \
+       anv_nir_apply_dynamic_offsets.c                 \
+       anv_nir_apply_pipeline_layout.c                 \
+       anv_nir_lower_push_constants.c                  \
+       anv_pass.c                                      \
+       anv_pipeline.c                                  \
+       anv_private.h                                   \
+       anv_query.c                                     \
+       anv_util.c                                      \
+       anv_wsi.c                                       \
+       anv_wsi_x11.c
+
+BUILT_SOURCES =                                         \
+       anv_entrypoints.h                               \
+       anv_entrypoints.c
+
+libanv_gen7_la_SOURCES =                                \
+       genX_cmd_buffer.c                               \
+       gen7_cmd_buffer.c                               \
+       gen7_pipeline.c                                 \
+       gen7_state.c
+libanv_gen7_la_CFLAGS = $(libvulkan_la_CFLAGS) -DANV_GENx10=70
+
+libanv_gen75_la_SOURCES =                               \
+       genX_cmd_buffer.c                               \
+       gen7_cmd_buffer.c                               \
+       gen7_pipeline.c                                 \
+       gen7_state.c
+libanv_gen75_la_CFLAGS = $(libvulkan_la_CFLAGS) -DANV_GENx10=75
+
+libanv_gen8_la_SOURCES =                                       \
+       genX_cmd_buffer.c                               \
+       gen8_cmd_buffer.c                               \
+       gen8_pipeline.c                                 \
+       gen8_state.c
+libanv_gen8_la_CFLAGS = $(libvulkan_la_CFLAGS) -DANV_GENx10=80
+
+libanv_gen9_la_SOURCES =                                       \
+       genX_cmd_buffer.c                               \
+       gen8_cmd_buffer.c                               \
+       gen8_pipeline.c                                 \
+       gen8_state.c
+libanv_gen9_la_CFLAGS = $(libvulkan_la_CFLAGS) -DANV_GENx10=90
+
+if HAVE_EGL_PLATFORM_WAYLAND
+BUILT_SOURCES += \
+       wayland-drm-protocol.c \
+       wayland-drm-client-protocol.h
+
+%-protocol.c : $(top_srcdir)/src/egl/wayland/wayland-drm/%.xml
+       $(AM_V_GEN)$(WAYLAND_SCANNER) code < $< > $@
+
+%-client-protocol.h : $(top_srcdir)/src/egl/wayland/wayland-drm/%.xml
+       $(AM_V_GEN)$(WAYLAND_SCANNER) client-header < $< > $@
+
+AM_CPPFLAGS += -I$(top_srcdir)/src/egl/wayland/wayland-drm
+VULKAN_SOURCES += \
+       wayland-drm-protocol.c \
+       anv_wsi_wayland.c
+libvulkan_la_CFLAGS += -DHAVE_WAYLAND_PLATFORM
+endif
+
+libvulkan_la_SOURCES =                                  \
+       $(VULKAN_SOURCES)                               \
+       anv_gem.c
+
+anv_entrypoints.h : anv_entrypoints_gen.py $(vulkan_include_HEADERS)
+       $(AM_V_GEN) cat $(vulkan_include_HEADERS) | $(CPP) $(VULKAN_ENTRYPOINT_CPPFLAGS) - | $(PYTHON2) $< header > $@
+
+anv_entrypoints.c : anv_entrypoints_gen.py $(vulkan_include_HEADERS)
+       $(AM_V_GEN) cat $(vulkan_include_HEADERS) | $(CPP) $(VULKAN_ENTRYPOINT_CPPFLAGS) - | $(PYTHON2) $< code > $@
+
+CLEANFILES = $(BUILT_SOURCES)
+
+libvulkan_la_LIBADD = $(WAYLAND_LIBS) -lxcb -lxcb-dri3 \
+       $(top_builddir)/src/isl/libisl.la \
+       $(top_builddir)/src/mesa/drivers/dri/i965/libi965_compiler.la \
+       ../mesa/libmesa.la \
+       ../mesa/drivers/dri/common/libdri_test_stubs.la \
+       -lpthread -ldl -lstdc++ \
+        $(PER_GEN_LIBS)
+
+# Libvulkan with dummy gem. Used for unit tests.
+
+libvulkan_test_la_SOURCES =                             \
+       $(VULKAN_SOURCES)                               \
+       anv_gem_stubs.c
+
+libvulkan_test_la_CFLAGS = $(libvulkan_la_CFLAGS)
+libvulkan_test_la_LIBADD = $(libvulkan_la_LIBADD)
+
+include $(top_srcdir)/install-lib-links.mk
diff --git a/src/vulkan/anv_allocator.c b/src/vulkan/anv_allocator.c
new file mode 100644 (file)
index 0000000..4be149e
--- /dev/null
@@ -0,0 +1,860 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define _DEFAULT_SOURCE
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <values.h>
+#include <assert.h>
+#include <linux/futex.h>
+#include <linux/memfd.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+
+#include "anv_private.h"
+
+#ifdef HAVE_VALGRIND
+#define VG_NOACCESS_READ(__ptr) ({                       \
+   VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
+   __typeof(*(__ptr)) __val = *(__ptr);                  \
+   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
+   __val;                                                \
+})
+#define VG_NOACCESS_WRITE(__ptr, __val) ({                  \
+   VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr)));  \
+   *(__ptr) = (__val);                                      \
+   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));   \
+})
+#else
+#define VG_NOACCESS_READ(__ptr) (*(__ptr))
+#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
+#endif
+
+/* Design goals:
+ *
+ *  - Lock free (except when resizing underlying bos)
+ *
+ *  - Constant time allocation with typically only one atomic
+ *
+ *  - Multiple allocation sizes without fragmentation
+ *
+ *  - Can grow while keeping addresses and offset of contents stable
+ *
+ *  - All allocations within one bo so we can point one of the
+ *    STATE_BASE_ADDRESS pointers at it.
+ *
+ * The overall design is a two-level allocator: top level is a fixed size, big
+ * block (8k) allocator, which operates out of a bo.  Allocation is done by
+ * either pulling a block from the free list or growing the used range of the
+ * bo.  Growing the range may run out of space in the bo which we then need to
+ * grow.  Growing the bo is tricky in a multi-threaded, lockless environment:
+ * we need to keep all pointers and contents in the old map valid.  GEM bos in
+ * general can't grow, but we use a trick: we create a memfd and use ftruncate
+ * to grow it as necessary.  We mmap the new size and then create a gem bo for
+ * it using the new gem userptr ioctl.  Without heavy-handed locking around
+ * our allocation fast-path, there isn't really a way to munmap the old mmap,
+ * so we just keep it around until garbage collection time.  While the block
+ * allocator is lockless for normal operations, we block other threads trying
+ * to allocate while we're growing the map.  It shouldn't happen often, and
+ * growing is fast anyway.
+ *
+ * At the next level we can use various sub-allocators.  The state pool is a
+ * pool of smaller, fixed size objects, which operates much like the block
+ * pool.  It uses a free list for freeing objects, but when it runs out of
+ * space it just allocates a new block from the block pool.  This allocator is
+ * intended for longer lived state objects such as SURFACE_STATE and most
+ * other persistent state objects in the API.  We may need to track more info
+ * with these objects and a pointer back to the CPU object (eg VkImage).  In
+ * those cases we just allocate a slightly bigger object and put the extra
+ * state after the GPU state object.
+ *
+ * The state stream allocator works similar to how the i965 DRI driver streams
+ * all its state.  Even with Vulkan, we need to emit transient state (whether
+ * surface state base or dynamic state base), and for that we can just get a
+ * block and fill it up.  These cases are local to a command buffer and the
+ * sub-allocator need not be thread safe.  The streaming allocator gets a new
+ * block when it runs out of space and chains them together so they can be
+ * easily freed.
+ */
+
+/* Allocations are always at least 64 byte aligned, so 1 is an invalid value.
+ * We use it to indicate the free list is empty. */
+#define EMPTY 1
+
+struct anv_mmap_cleanup {
+   void *map;
+   size_t size;
+   uint32_t gem_handle;
+};
+
+#define ANV_MMAP_CLEANUP_INIT ((struct anv_mmap_cleanup){0})
+
+static inline long
+sys_futex(void *addr1, int op, int val1,
+          struct timespec *timeout, void *addr2, int val3)
+{
+   return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
+}
+
+static inline int
+futex_wake(uint32_t *addr, int count)
+{
+   return sys_futex(addr, FUTEX_WAKE, count, NULL, NULL, 0);
+}
+
+static inline int
+futex_wait(uint32_t *addr, int32_t value)
+{
+   return sys_futex(addr, FUTEX_WAIT, value, NULL, NULL, 0);
+}
+
+static inline int
+memfd_create(const char *name, unsigned int flags)
+{
+   return syscall(SYS_memfd_create, name, flags);
+}
+
+static inline uint32_t
+ilog2_round_up(uint32_t value)
+{
+   assert(value != 0);
+   return 32 - __builtin_clz(value - 1);
+}
+
+static inline uint32_t
+round_to_power_of_two(uint32_t value)
+{
+   return 1 << ilog2_round_up(value);
+}
+
+static bool
+anv_free_list_pop(union anv_free_list *list, void **map, int32_t *offset)
+{
+   union anv_free_list current, new, old;
+
+   current.u64 = list->u64;
+   while (current.offset != EMPTY) {
+      /* We have to add a memory barrier here so that the list head (and
+       * offset) gets read before we read the map pointer.  This way we
+       * know that the map pointer is valid for the given offset at the
+       * point where we read it.
+       */
+      __sync_synchronize();
+
+      int32_t *next_ptr = *map + current.offset;
+      new.offset = VG_NOACCESS_READ(next_ptr);
+      new.count = current.count + 1;
+      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
+      if (old.u64 == current.u64) {
+         *offset = current.offset;
+         return true;
+      }
+      current = old;
+   }
+
+   return false;
+}
+
+static void
+anv_free_list_push(union anv_free_list *list, void *map, int32_t offset)
+{
+   union anv_free_list current, old, new;
+   int32_t *next_ptr = map + offset;
+
+   old = *list;
+   do {
+      current = old;
+      VG_NOACCESS_WRITE(next_ptr, current.offset);
+      new.offset = offset;
+      new.count = current.count + 1;
+      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
+   } while (old.u64 != current.u64);
+}
+
+/* All pointers in the ptr_free_list are assumed to be page-aligned.  This
+ * means that the bottom 12 bits should all be zero.
+ */
+#define PFL_COUNT(x) ((uintptr_t)(x) & 0xfff)
+#define PFL_PTR(x) ((void *)((uintptr_t)(x) & ~0xfff))
+#define PFL_PACK(ptr, count) ({           \
+   assert(((uintptr_t)(ptr) & 0xfff) == 0); \
+   (void *)((uintptr_t)(ptr) | (uintptr_t)((count) & 0xfff)); \
+})
+
+static bool
+anv_ptr_free_list_pop(void **list, void **elem)
+{
+   void *current = *list;
+   while (PFL_PTR(current) != NULL) {
+      void **next_ptr = PFL_PTR(current);
+      void *new_ptr = VG_NOACCESS_READ(next_ptr);
+      unsigned new_count = PFL_COUNT(current) + 1;
+      void *new = PFL_PACK(new_ptr, new_count);
+      void *old = __sync_val_compare_and_swap(list, current, new);
+      if (old == current) {
+         *elem = PFL_PTR(current);
+         return true;
+      }
+      current = old;
+   }
+
+   return false;
+}
+
+static void
+anv_ptr_free_list_push(void **list, void *elem)
+{
+   void *old, *current;
+   void **next_ptr = elem;
+
+   old = *list;
+   do {
+      current = old;
+      VG_NOACCESS_WRITE(next_ptr, PFL_PTR(current));
+      unsigned new_count = PFL_COUNT(current) + 1;
+      void *new = PFL_PACK(elem, new_count);
+      old = __sync_val_compare_and_swap(list, current, new);
+   } while (old != current);
+}
+
+static uint32_t
+anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state);
+
+void
+anv_block_pool_init(struct anv_block_pool *pool,
+                    struct anv_device *device, uint32_t block_size)
+{
+   assert(util_is_power_of_two(block_size));
+
+   pool->device = device;
+   pool->bo.gem_handle = 0;
+   pool->bo.offset = 0;
+   pool->bo.size = 0;
+   pool->block_size = block_size;
+   pool->free_list = ANV_FREE_LIST_EMPTY;
+   pool->back_free_list = ANV_FREE_LIST_EMPTY;
+
+   pool->fd = memfd_create("block pool", MFD_CLOEXEC);
+   if (pool->fd == -1)
+      return;
+
+   /* Just make it 2GB up-front.  The Linux kernel won't actually back it
+    * with pages until we either map and fault on one of them or we use
+    * userptr and send a chunk of it off to the GPU.
+    */
+   if (ftruncate(pool->fd, BLOCK_POOL_MEMFD_SIZE) == -1)
+      return;
+
+   anv_vector_init(&pool->mmap_cleanups,
+                   round_to_power_of_two(sizeof(struct anv_mmap_cleanup)), 128);
+
+   pool->state.next = 0;
+   pool->state.end = 0;
+   pool->back_state.next = 0;
+   pool->back_state.end = 0;
+
+   /* Immediately grow the pool so we'll have a backing bo. */
+   pool->state.end = anv_block_pool_grow(pool, &pool->state);
+}
+
+void
+anv_block_pool_finish(struct anv_block_pool *pool)
+{
+   struct anv_mmap_cleanup *cleanup;
+
+   anv_vector_foreach(cleanup, &pool->mmap_cleanups) {
+      if (cleanup->map)
+         munmap(cleanup->map, cleanup->size);
+      if (cleanup->gem_handle)
+         anv_gem_close(pool->device, cleanup->gem_handle);
+   }
+
+   anv_vector_finish(&pool->mmap_cleanups);
+
+   close(pool->fd);
+}
+
+#define PAGE_SIZE 4096
+
+/** Grows and re-centers the block pool.
+ *
+ * We grow the block pool in one or both directions in such a way that the
+ * following conditions are met:
+ *
+ *  1) The size of the entire pool is always a power of two.
+ *
+ *  2) The pool only grows on both ends.  Neither end can get
+ *     shortened.
+ *
+ *  3) At the end of the allocation, we have about twice as much space
+ *     allocated for each end as we have used.  This way the pool doesn't
+ *     grow too far in one direction or the other.
+ *
+ *  4) If the _alloc_back() has never been called, then the back portion of
+ *     the pool retains a size of zero.  (This makes it easier for users of
+ *     the block pool that only want a one-sided pool.)
+ *
+ *  5) We have enough space allocated for at least one more block in
+ *     whichever side `state` points to.
+ *
+ *  6) The center of the pool is always aligned to both the block_size of
+ *     the pool and a 4K CPU page.
+ */
+static uint32_t
+anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
+{
+   size_t size;
+   void *map;
+   uint32_t gem_handle;
+   struct anv_mmap_cleanup *cleanup;
+
+   pthread_mutex_lock(&pool->device->mutex);
+
+   assert(state == &pool->state || state == &pool->back_state);
+
+   /* Gather a little usage information on the pool.  Since we may have
+    * threads waiting in queue to get some storage while we resize, it's
+    * actually possible that total_used will be larger than old_size.  In
+    * particular, block_pool_alloc() increments state->next prior to
+    * calling block_pool_grow, so this ensures that we get enough space for
+    * whichever side tries to grow the pool.
+    *
+    * We align to a page size because it makes it easier to do our
+    * calculations later in such a way that we stay page-aligned.
+    */
+   uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
+   uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
+   uint32_t total_used = front_used + back_used;
+
+   assert(state == &pool->state || back_used > 0);
+
+   size_t old_size = pool->bo.size;
+
+   if (old_size != 0 &&
+       back_used * 2 <= pool->center_bo_offset &&
+       front_used * 2 <= (old_size - pool->center_bo_offset)) {
+      /* If we're in this case then this isn't the first allocation and we
+       * already have enough space on both sides to hold double what we
+       * have allocated.  There's nothing for us to do.
+       */
+      goto done;
+   }
+
+   if (old_size == 0) {
+      /* This is the first allocation */
+      size = MAX2(32 * pool->block_size, PAGE_SIZE);
+   } else {
+      size = old_size * 2;
+   }
+
+   /* We can't have a block pool bigger than 1GB because we use signed
+    * 32-bit offsets in the free list and we don't want overflow.  We
+    * should never need a block pool bigger than 1GB anyway.
+    */
+   assert(size <= (1u << 31));
+
+   /* We compute a new center_bo_offset such that, when we double the size
+    * of the pool, we maintain the ratio of how much is used by each side.
+    * This way things should remain more-or-less balanced.
+    */
+   uint32_t center_bo_offset;
+   if (back_used == 0) {
+      /* If we're in this case then we have never called alloc_back().  In
+       * this case, we want to keep the offset at 0 to make things as simple
+       * as possible for users that don't care about back allocations.
+       */
+      center_bo_offset = 0;
+   } else {
+      /* Try to "center" the allocation based on how much is currently in
+       * use on each side of the center line.
+       */
+      center_bo_offset = ((uint64_t)size * back_used) / total_used;
+
+      /* Align down to a multiple of both the block size and page size */
+      uint32_t granularity = MAX2(pool->block_size, PAGE_SIZE);
+      assert(util_is_power_of_two(granularity));
+      center_bo_offset &= ~(granularity - 1);
+
+      assert(center_bo_offset >= back_used);
+
+      /* Make sure we don't shrink the back end of the pool */
+      if (center_bo_offset < pool->back_state.end)
+         center_bo_offset = pool->back_state.end;
+
+      /* Make sure that we don't shrink the front end of the pool */
+      if (size - center_bo_offset < pool->state.end)
+         center_bo_offset = size - pool->state.end;
+   }
+
+   assert(center_bo_offset % pool->block_size == 0);
+   assert(center_bo_offset % PAGE_SIZE == 0);
+
+   /* Assert that we only ever grow the pool */
+   assert(center_bo_offset >= pool->back_state.end);
+   assert(size - center_bo_offset >= pool->state.end);
+
+   cleanup = anv_vector_add(&pool->mmap_cleanups);
+   if (!cleanup)
+      goto fail;
+   *cleanup = ANV_MMAP_CLEANUP_INIT;
+
+   /* Just leak the old map until we destroy the pool.  We can't munmap it
+    * without races or imposing locking on the block allocate fast path. On
+    * the whole the leaked maps adds up to less than the size of the
+    * current map.  MAP_POPULATE seems like the right thing to do, but we
+    * should try to get some numbers.
+    */
+   map = mmap(NULL, size, PROT_READ | PROT_WRITE,
+              MAP_SHARED | MAP_POPULATE, pool->fd,
+              BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
+   cleanup->map = map;
+   cleanup->size = size;
+
+   if (map == MAP_FAILED)
+      goto fail;
+
+   gem_handle = anv_gem_userptr(pool->device, map, size);
+   if (gem_handle == 0)
+      goto fail;
+   cleanup->gem_handle = gem_handle;
+
+   /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
+    * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are
+    * always created as I915_CACHING_CACHED, which on non-LLC means
+    * snooped. That can be useful but comes with a bit of overhead.  Since
+    * we're explicitly clflushing and don't want the overhead we need to turn
+    * it off. */
+   if (!pool->device->info.has_llc) {
+      anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_NONE);
+      anv_gem_set_domain(pool->device, gem_handle,
+                         I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   }
+
+   /* Now that we successfully allocated everything, we can write the new
+    * values back into pool. */
+   pool->map = map + center_bo_offset;
+   pool->center_bo_offset = center_bo_offset;
+   pool->bo.gem_handle = gem_handle;
+   pool->bo.size = size;
+   pool->bo.map = map;
+   pool->bo.index = 0;
+
+done:
+   pthread_mutex_unlock(&pool->device->mutex);
+
+   /* Return the appropriate new size.  This function never actually
+    * updates state->next.  Instead, we let the caller do that because it
+    * needs to do so in order to maintain its concurrency model.
+    */
+   if (state == &pool->state) {
+      return pool->bo.size - pool->center_bo_offset;
+   } else {
+      assert(pool->center_bo_offset > 0);
+      return pool->center_bo_offset;
+   }
+
+fail:
+   pthread_mutex_unlock(&pool->device->mutex);
+
+   return 0;
+}
+
+static uint32_t
+anv_block_pool_alloc_new(struct anv_block_pool *pool,
+                         struct anv_block_state *pool_state)
+{
+   struct anv_block_state state, old, new;
+
+   while (1) {
+      state.u64 = __sync_fetch_and_add(&pool_state->u64, pool->block_size);
+      if (state.next < state.end) {
+         assert(pool->map);
+         return state.next;
+      } else if (state.next == state.end) {
+         /* We allocated the first block outside the pool, we have to grow it.
+          * pool_state->next acts as a mutex: threads who try to allocate now will
+          * get block indexes above the current limit and hit futex_wait
+          * below. */
+         new.next = state.next + pool->block_size;
+         new.end = anv_block_pool_grow(pool, pool_state);
+         assert(new.end >= new.next && new.end % pool->block_size == 0);
+         old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
+         if (old.next != state.next)
+            futex_wake(&pool_state->end, INT_MAX);
+         return state.next;
+      } else {
+         futex_wait(&pool_state->end, state.end);
+         continue;
+      }
+   }
+}
+
+int32_t
+anv_block_pool_alloc(struct anv_block_pool *pool)
+{
+   int32_t offset;
+
+   /* Try free list first. */
+   if (anv_free_list_pop(&pool->free_list, &pool->map, &offset)) {
+      assert(offset >= 0);
+      assert(pool->map);
+      return offset;
+   }
+
+   return anv_block_pool_alloc_new(pool, &pool->state);
+}
+
+/* Allocates a block out of the back of the block pool.
+ *
+ * This will allocate a block earlier than the "start" of the block pool.
+ * The offsets returned from this function will be negative but will still
+ * be correct relative to the block pool's map pointer.
+ *
+ * If you ever use anv_block_pool_alloc_back, then you will have to do
+ * gymnastics with the block pool's BO when doing relocations.
+ */
+int32_t
+anv_block_pool_alloc_back(struct anv_block_pool *pool)
+{
+   int32_t offset;
+
+   /* Try free list first. */
+   if (anv_free_list_pop(&pool->back_free_list, &pool->map, &offset)) {
+      assert(offset < 0);
+      assert(pool->map);
+      return offset;
+   }
+
+   offset = anv_block_pool_alloc_new(pool, &pool->back_state);
+
+   /* The offset we get out of anv_block_pool_alloc_new() is actually the
+    * number of bytes downwards from the middle to the end of the block.
+    * We need to turn it into a (negative) offset from the middle to the
+    * start of the block.
+    */
+   assert(offset >= 0);
+   return -(offset + pool->block_size);
+}
+
+void
+anv_block_pool_free(struct anv_block_pool *pool, int32_t offset)
+{
+   if (offset < 0) {
+      anv_free_list_push(&pool->back_free_list, pool->map, offset);
+   } else {
+      anv_free_list_push(&pool->free_list, pool->map, offset);
+   }
+}
+
+static void
+anv_fixed_size_state_pool_init(struct anv_fixed_size_state_pool *pool,
+                               size_t state_size)
+{
+   /* At least a cache line and must divide the block size. */
+   assert(state_size >= 64 && util_is_power_of_two(state_size));
+
+   pool->state_size = state_size;
+   pool->free_list = ANV_FREE_LIST_EMPTY;
+   pool->block.next = 0;
+   pool->block.end = 0;
+}
+
+static uint32_t
+anv_fixed_size_state_pool_alloc(struct anv_fixed_size_state_pool *pool,
+                                struct anv_block_pool *block_pool)
+{
+   int32_t offset;
+   struct anv_block_state block, old, new;
+
+   /* Try free list first. */
+   if (anv_free_list_pop(&pool->free_list, &block_pool->map, &offset)) {
+      assert(offset >= 0);
+      return offset;
+   }
+
+   /* If free list was empty (or somebody raced us and took the items) we
+    * allocate a new item from the end of the block */
+ restart:
+   block.u64 = __sync_fetch_and_add(&pool->block.u64, pool->state_size);
+
+   if (block.next < block.end) {
+      return block.next;
+   } else if (block.next == block.end) {
+      offset = anv_block_pool_alloc(block_pool);
+      new.next = offset + pool->state_size;
+      new.end = offset + block_pool->block_size;
+      old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
+      if (old.next != block.next)
+         futex_wake(&pool->block.end, INT_MAX);
+      return offset;
+   } else {
+      futex_wait(&pool->block.end, block.end);
+      goto restart;
+   }
+}
+
+static void
+anv_fixed_size_state_pool_free(struct anv_fixed_size_state_pool *pool,
+                               struct anv_block_pool *block_pool,
+                               uint32_t offset)
+{
+   anv_free_list_push(&pool->free_list, block_pool->map, offset);
+}
+
+void
+anv_state_pool_init(struct anv_state_pool *pool,
+                    struct anv_block_pool *block_pool)
+{
+   pool->block_pool = block_pool;
+   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
+      size_t size = 1 << (ANV_MIN_STATE_SIZE_LOG2 + i);
+      anv_fixed_size_state_pool_init(&pool->buckets[i], size);
+   }
+   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
+}
+
+void
+anv_state_pool_finish(struct anv_state_pool *pool)
+{
+   VG(VALGRIND_DESTROY_MEMPOOL(pool));
+}
+
+struct anv_state
+anv_state_pool_alloc(struct anv_state_pool *pool, size_t size, size_t align)
+{
+   unsigned size_log2 = ilog2_round_up(size < align ? align : size);
+   assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
+   if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
+      size_log2 = ANV_MIN_STATE_SIZE_LOG2;
+   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;
+
+   struct anv_state state;
+   state.alloc_size = 1 << size_log2;
+   state.offset = anv_fixed_size_state_pool_alloc(&pool->buckets[bucket],
+                                                  pool->block_pool);
+   state.map = pool->block_pool->map + state.offset;
+   VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
+   return state;
+}
+
+void
+anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
+{
+   assert(util_is_power_of_two(state.alloc_size));
+   unsigned size_log2 = ilog2_round_up(state.alloc_size);
+   assert(size_log2 >= ANV_MIN_STATE_SIZE_LOG2 &&
+          size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
+   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;
+
+   VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
+   anv_fixed_size_state_pool_free(&pool->buckets[bucket],
+                                  pool->block_pool, state.offset);
+}
+
+#define NULL_BLOCK 1
+struct anv_state_stream_block {
+   /* The next block */
+   struct anv_state_stream_block *next;
+
+   /* The offset into the block pool at which this block starts */
+   uint32_t offset;
+
+#ifdef HAVE_VALGRIND
+   /* A pointer to the first user-allocated thing in this block.  This is
+    * what valgrind sees as the start of the block.
+    */
+   void *_vg_ptr;
+#endif
+};
+
+/* The state stream allocator is a one-shot, single threaded allocator for
+ * variable sized blocks.  We use it for allocating dynamic state.
+ */
+void
+anv_state_stream_init(struct anv_state_stream *stream,
+                      struct anv_block_pool *block_pool)
+{
+   stream->block_pool = block_pool;
+   stream->block = NULL;
+
+   /* Ensure that next + whatever > end.  This way the first call to
+    * state_stream_alloc fetches a new block.
+    */
+   stream->next = 1;
+   stream->end = 0;
+
+   VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
+}
+
+void
+anv_state_stream_finish(struct anv_state_stream *stream)
+{
+   const uint32_t block_size = stream->block_pool->block_size;
+
+   struct anv_state_stream_block *next = stream->block;
+   while (next != NULL) {
+      VG(VALGRIND_MAKE_MEM_DEFINED(next, sizeof(*next)));
+      struct anv_state_stream_block sb = VG_NOACCESS_READ(next);
+      VG(VALGRIND_MEMPOOL_FREE(stream, sb._vg_ptr));
+      VG(VALGRIND_MAKE_MEM_UNDEFINED(next, block_size));
+      anv_block_pool_free(stream->block_pool, sb.offset);
+      next = sb.next;
+   }
+
+   VG(VALGRIND_DESTROY_MEMPOOL(stream));
+}
+
+struct anv_state
+anv_state_stream_alloc(struct anv_state_stream *stream,
+                       uint32_t size, uint32_t alignment)
+{
+   struct anv_state_stream_block *sb = stream->block;
+
+   struct anv_state state;
+
+   state.offset = align_u32(stream->next, alignment);
+   if (state.offset + size > stream->end) {
+      uint32_t block = anv_block_pool_alloc(stream->block_pool);
+      sb = stream->block_pool->map + block;
+
+      VG(VALGRIND_MAKE_MEM_UNDEFINED(sb, sizeof(*sb)));
+      sb->next = stream->block;
+      sb->offset = block;
+      VG(sb->_vg_ptr = NULL);
+      VG(VALGRIND_MAKE_MEM_NOACCESS(sb, stream->block_pool->block_size));
+
+      stream->block = sb;
+      stream->start = block;
+      stream->next = block + sizeof(*sb);
+      stream->end = block + stream->block_pool->block_size;
+
+      state.offset = align_u32(stream->next, alignment);
+      assert(state.offset + size <= stream->end);
+   }
+
+   assert(state.offset > stream->start);
+   state.map = (void *)sb + (state.offset - stream->start);
+   state.alloc_size = size;
+
+#ifdef HAVE_VALGRIND
+   void *vg_ptr = VG_NOACCESS_READ(&sb->_vg_ptr);
+   if (vg_ptr == NULL) {
+      vg_ptr = state.map;
+      VG_NOACCESS_WRITE(&sb->_vg_ptr, vg_ptr);
+      VALGRIND_MEMPOOL_ALLOC(stream, vg_ptr, size);
+   } else {
+      void *state_end = state.map + state.alloc_size;
+      /* This only updates the mempool.  The newly allocated chunk is still
+       * marked as NOACCESS. */
+      VALGRIND_MEMPOOL_CHANGE(stream, vg_ptr, vg_ptr, state_end - vg_ptr);
+      /* Mark the newly allocated chunk as undefined */
+      VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size);
+   }
+#endif
+
+   stream->next = state.offset + size;
+
+   return state;
+}
+
+struct bo_pool_bo_link {
+   struct bo_pool_bo_link *next;
+   struct anv_bo bo;
+};
+
+void
+anv_bo_pool_init(struct anv_bo_pool *pool,
+                 struct anv_device *device, uint32_t bo_size)
+{
+   pool->device = device;
+   pool->bo_size = bo_size;
+   pool->free_list = NULL;
+
+   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
+}
+
+void
+anv_bo_pool_finish(struct anv_bo_pool *pool)
+{
+   struct bo_pool_bo_link *link = PFL_PTR(pool->free_list);
+   while (link != NULL) {
+      struct bo_pool_bo_link link_copy = VG_NOACCESS_READ(link);
+
+      anv_gem_munmap(link_copy.bo.map, pool->bo_size);
+      anv_gem_close(pool->device, link_copy.bo.gem_handle);
+      link = link_copy.next;
+   }
+
+   VG(VALGRIND_DESTROY_MEMPOOL(pool));
+}
+
+VkResult
+anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo)
+{
+   VkResult result;
+
+   void *next_free_void;
+   if (anv_ptr_free_list_pop(&pool->free_list, &next_free_void)) {
+      struct bo_pool_bo_link *next_free = next_free_void;
+      *bo = VG_NOACCESS_READ(&next_free->bo);
+      assert(bo->map == next_free);
+      assert(bo->size == pool->bo_size);
+
+      VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, pool->bo_size));
+
+      return VK_SUCCESS;
+   }
+
+   struct anv_bo new_bo;
+
+   result = anv_bo_init_new(&new_bo, pool->device, pool->bo_size);
+   if (result != VK_SUCCESS)
+      return result;
+
+   assert(new_bo.size == pool->bo_size);
+
+   new_bo.map = anv_gem_mmap(pool->device, new_bo.gem_handle, 0, pool->bo_size, 0);
+   if (new_bo.map == NULL) {
+      anv_gem_close(pool->device, new_bo.gem_handle);
+      return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+   }
+
+   *bo = new_bo;
+
+   VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, pool->bo_size));
+
+   return VK_SUCCESS;
+}
+
+void
+anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo)
+{
+   struct bo_pool_bo_link *link = bo->map;
+   link->bo = *bo;
+
+   VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
+   anv_ptr_free_list_push(&pool->free_list, link);
+}
diff --git a/src/vulkan/anv_aub.h b/src/vulkan/anv_aub.h
new file mode 100644 (file)
index 0000000..7a67712
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_aub.h
+ *
+ * The AUB file is a file format used by Intel's internal simulation
+ * and other validation tools.  It can be used at various levels by a
+ * driver to input state to the simulated hardware or a replaying
+ * debugger.
+ *
+ * We choose to dump AUB files using the trace block format for ease
+ * of implementation -- dump out the blocks of memory as plain blobs
+ * and insert ring commands to execute the batchbuffer blob.
+ */
+
+#ifndef _INTEL_AUB_H
+#define _INTEL_AUB_H
+
+#define AUB_MI_NOOP                    (0)
+#define AUB_MI_BATCH_BUFFER_START      (0x31 << 23)
+#define AUB_PIPE_CONTROL               (0x7a000002)
+
+/* DW0: instruction type. */
+
+#define CMD_AUB                        (7 << 29)
+
+#define CMD_AUB_HEADER         (CMD_AUB | (1 << 23) | (0x05 << 16))
+/* DW1 */
+# define AUB_HEADER_MAJOR_SHIFT                24
+# define AUB_HEADER_MINOR_SHIFT                16
+
+#define CMD_AUB_TRACE_HEADER_BLOCK (CMD_AUB | (1 << 23) | (0x41 << 16))
+#define CMD_AUB_DUMP_BMP           (CMD_AUB | (1 << 23) | (0x9e << 16))
+
+/* DW1 */
+#define AUB_TRACE_OPERATION_MASK       0x000000ff
+#define AUB_TRACE_OP_COMMENT           0x00000000
+#define AUB_TRACE_OP_DATA_WRITE                0x00000001
+#define AUB_TRACE_OP_COMMAND_WRITE     0x00000002
+#define AUB_TRACE_OP_MMIO_WRITE                0x00000003
+// operation = TRACE_DATA_WRITE, Type
+#define AUB_TRACE_TYPE_MASK            0x0000ff00
+#define AUB_TRACE_TYPE_NOTYPE          (0 << 8)
+#define AUB_TRACE_TYPE_BATCH           (1 << 8)
+#define AUB_TRACE_TYPE_VERTEX_BUFFER   (5 << 8)
+#define AUB_TRACE_TYPE_2D_MAP          (6 << 8)
+#define AUB_TRACE_TYPE_CUBE_MAP                (7 << 8)
+#define AUB_TRACE_TYPE_VOLUME_MAP      (9 << 8)
+#define AUB_TRACE_TYPE_1D_MAP          (10 << 8)
+#define AUB_TRACE_TYPE_CONSTANT_BUFFER (11 << 8)
+#define AUB_TRACE_TYPE_CONSTANT_URB    (12 << 8)
+#define AUB_TRACE_TYPE_INDEX_BUFFER    (13 << 8)
+#define AUB_TRACE_TYPE_GENERAL         (14 << 8)
+#define AUB_TRACE_TYPE_SURFACE         (15 << 8)
+
+
+// operation = TRACE_COMMAND_WRITE, Type =
+#define AUB_TRACE_TYPE_RING_HWB                (1 << 8)
+#define AUB_TRACE_TYPE_RING_PRB0       (2 << 8)
+#define AUB_TRACE_TYPE_RING_PRB1       (3 << 8)
+#define AUB_TRACE_TYPE_RING_PRB2       (4 << 8)
+
+// Address space
+#define AUB_TRACE_ADDRESS_SPACE_MASK   0x00ff0000
+#define AUB_TRACE_MEMTYPE_GTT          (0 << 16)
+#define AUB_TRACE_MEMTYPE_LOCAL                (1 << 16)
+#define AUB_TRACE_MEMTYPE_NONLOCAL     (2 << 16)
+#define AUB_TRACE_MEMTYPE_PCI          (3 << 16)
+#define AUB_TRACE_MEMTYPE_GTT_ENTRY     (4 << 16)
+
+/* DW2 */
+
+/**
+ * aub_state_struct_type enum values are encoded with the top 16 bits
+ * representing the type to be delivered to the .aub file, and the bottom 16
+ * bits representing the subtype.  This macro performs the encoding.
+ */
+#define ENCODE_SS_TYPE(type, subtype) (((type) << 16) | (subtype))
+
+enum aub_state_struct_type {
+   AUB_TRACE_VS_STATE =                        ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 1),
+   AUB_TRACE_GS_STATE =                        ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 2),
+   AUB_TRACE_CLIP_STATE =              ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 3),
+   AUB_TRACE_SF_STATE =                        ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 4),
+   AUB_TRACE_WM_STATE =                        ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 5),
+   AUB_TRACE_CC_STATE =                        ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 6),
+   AUB_TRACE_CLIP_VP_STATE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 7),
+   AUB_TRACE_SF_VP_STATE =             ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 8),
+   AUB_TRACE_CC_VP_STATE =             ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x9),
+   AUB_TRACE_SAMPLER_STATE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xa),
+   AUB_TRACE_KERNEL_INSTRUCTIONS =     ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xb),
+   AUB_TRACE_SCRATCH_SPACE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xc),
+   AUB_TRACE_SAMPLER_DEFAULT_COLOR =   ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0xd),
+
+   AUB_TRACE_SCISSOR_STATE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x15),
+   AUB_TRACE_BLEND_STATE =             ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x16),
+   AUB_TRACE_DEPTH_STENCIL_STATE =     ENCODE_SS_TYPE(AUB_TRACE_TYPE_GENERAL, 0x17),
+
+   AUB_TRACE_VERTEX_BUFFER =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_VERTEX_BUFFER, 0),
+   AUB_TRACE_BINDING_TABLE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_SURFACE, 0x100),
+   AUB_TRACE_SURFACE_STATE =           ENCODE_SS_TYPE(AUB_TRACE_TYPE_SURFACE, 0x200),
+   AUB_TRACE_VS_CONSTANTS =            ENCODE_SS_TYPE(AUB_TRACE_TYPE_CONSTANT_BUFFER, 0),
+   AUB_TRACE_WM_CONSTANTS =            ENCODE_SS_TYPE(AUB_TRACE_TYPE_CONSTANT_BUFFER, 1),
+};
+
+#undef ENCODE_SS_TYPE
+
+/**
+ * Decode a aub_state_struct_type value to determine the type that should be
+ * stored in the .aub file.
+ */
+static inline uint32_t AUB_TRACE_TYPE(enum aub_state_struct_type ss_type)
+{
+   /* The type is encoded in the top 16 bits (see ENCODE_SS_TYPE above). */
+   return (ss_type & 0xFFFF0000) >> 16;
+}
+
+/**
+ * Decode a state_struct_type value to determine the subtype that should be
+ * stored in the .aub file.
+ */
+static inline uint32_t AUB_TRACE_SUBTYPE(enum aub_state_struct_type ss_type)
+{
+   /* The subtype is encoded in the bottom 16 bits (see ENCODE_SS_TYPE). */
+   return ss_type & 0xFFFF;
+}
+
+/* DW3: address */
+/* DW4: len */
+
+#endif /* _INTEL_AUB_H */
diff --git a/src/vulkan/anv_batch_chain.c b/src/vulkan/anv_batch_chain.c
new file mode 100644 (file)
index 0000000..89215fe
--- /dev/null
@@ -0,0 +1,1053 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen7_pack.h"
+#include "gen8_pack.h"
+
+/** \file anv_batch_chain.c
+ *
+ * This file contains functions related to anv_cmd_buffer as a data
+ * structure.  This involves everything required to create and destroy
+ * the actual batch buffers as well as link them together and handle
+ * relocations and surface state.  It specifically does *not* contain any
+ * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
+ */
+
+/*-----------------------------------------------------------------------*
+ * Functions related to anv_reloc_list
+ *-----------------------------------------------------------------------*/
+
+/* Initialize *list, optionally as a deep copy of other_list.  When
+ * other_list is NULL the list starts empty with a default capacity of
+ * 256 entries.  The relocs and reloc_bos arrays are always allocated to
+ * the same length and grown together.
+ *
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if either allocation fails; on
+ * failure nothing is leaked.
+ */
+static VkResult
+anv_reloc_list_init_clone(struct anv_reloc_list *list,
+                          const VkAllocationCallbacks *alloc,
+                          const struct anv_reloc_list *other_list)
+{
+   if (other_list) {
+      list->num_relocs = other_list->num_relocs;
+      list->array_length = other_list->array_length;
+   } else {
+      list->num_relocs = 0;
+      list->array_length = 256;
+   }
+
+   list->relocs =
+      anv_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   if (list->relocs == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   list->reloc_bos =
+      anv_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   if (list->reloc_bos == NULL) {
+      /* Unwind the first allocation before reporting the failure. */
+      anv_free(alloc, list->relocs);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   if (other_list) {
+      memcpy(list->relocs, other_list->relocs,
+             list->array_length * sizeof(*list->relocs));
+      memcpy(list->reloc_bos, other_list->reloc_bos,
+             list->array_length * sizeof(*list->reloc_bos));
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Initialize an empty relocation list (convenience wrapper around
+ * anv_reloc_list_init_clone with no source list).
+ */
+VkResult
+anv_reloc_list_init(struct anv_reloc_list *list,
+                    const VkAllocationCallbacks *alloc)
+{
+   return anv_reloc_list_init_clone(list, alloc, NULL);
+}
+
+/* Free the list's backing arrays.  Does not touch the referenced bos;
+ * the list only borrows those pointers.
+ */
+void
+anv_reloc_list_finish(struct anv_reloc_list *list,
+                      const VkAllocationCallbacks *alloc)
+{
+   anv_free(alloc, list->relocs);
+   anv_free(alloc, list->reloc_bos);
+}
+
+/* Ensure the list can hold num_additional_relocs more entries, doubling
+ * the capacity of both parallel arrays (relocs and reloc_bos) until it
+ * fits.  On failure the original list is left intact and still usable.
+ *
+ * Returns VK_SUCCESS or VK_ERROR_OUT_OF_HOST_MEMORY.
+ */
+static VkResult
+anv_reloc_list_grow(struct anv_reloc_list *list,
+                    const VkAllocationCallbacks *alloc,
+                    size_t num_additional_relocs)
+{
+   if (list->num_relocs + num_additional_relocs <= list->array_length)
+      return VK_SUCCESS;
+
+   size_t new_length = list->array_length * 2;
+   while (new_length < list->num_relocs + num_additional_relocs)
+      new_length *= 2;
+
+   struct drm_i915_gem_relocation_entry *new_relocs =
+      anv_alloc(alloc, new_length * sizeof(*list->relocs), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (new_relocs == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   struct anv_bo **new_reloc_bos =
+      anv_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   /* Fix: this previously re-tested new_relocs (already known non-NULL),
+    * so a failed reloc_bos allocation went unnoticed and a NULL array
+    * was installed below.  Test the pointer we just allocated.
+    */
+   if (new_reloc_bos == NULL) {
+      anv_free(alloc, new_relocs);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs));
+   memcpy(new_reloc_bos, list->reloc_bos,
+          list->num_relocs * sizeof(*list->reloc_bos));
+
+   anv_free(alloc, list->relocs);
+   anv_free(alloc, list->reloc_bos);
+
+   list->array_length = new_length;
+   list->relocs = new_relocs;
+   list->reloc_bos = new_reloc_bos;
+
+   return VK_SUCCESS;
+}
+
+/* Append one relocation entry: batch offset `offset` should point at
+ * target_bo's address plus `delta`.  Returns the presumed 64-bit address
+ * (target_bo->offset + delta) so the caller can write it into the batch.
+ *
+ * NOTE(review): the anv_reloc_list_grow result is ignored (see TODO), so
+ * an OOM here would write past the array.
+ */
+uint64_t
+anv_reloc_list_add(struct anv_reloc_list *list,
+                   const VkAllocationCallbacks *alloc,
+                   uint32_t offset, struct anv_bo *target_bo, uint32_t delta)
+{
+   struct drm_i915_gem_relocation_entry *entry;
+   int index;
+
+   anv_reloc_list_grow(list, alloc, 1);
+   /* TODO: Handle failure */
+
+   /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
+   index = list->num_relocs++;
+   list->reloc_bos[index] = target_bo;
+   entry = &list->relocs[index];
+   entry->target_handle = target_bo->gem_handle;
+   entry->delta = delta;
+   entry->offset = offset;
+   entry->presumed_offset = target_bo->offset;
+   entry->read_domains = 0;
+   entry->write_domain = 0;
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
+
+   return target_bo->offset + delta;
+}
+
+/* Append all of `other`'s relocations to `list`, rebasing each copied
+ * entry's batch offset by `offset` (the position at which the other
+ * batch's contents were spliced into this one).
+ */
+static void
+anv_reloc_list_append(struct anv_reloc_list *list,
+                      const VkAllocationCallbacks *alloc,
+                      struct anv_reloc_list *other, uint32_t offset)
+{
+   anv_reloc_list_grow(list, alloc, other->num_relocs);
+   /* TODO: Handle failure */
+
+   memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
+          other->num_relocs * sizeof(other->relocs[0]));
+   memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
+          other->num_relocs * sizeof(other->reloc_bos[0]));
+
+   /* Only the newly copied entries get rebased. */
+   for (uint32_t i = 0; i < other->num_relocs; i++)
+      list->relocs[i + list->num_relocs].offset += offset;
+
+   list->num_relocs += other->num_relocs;
+}
+
+/*-----------------------------------------------------------------------*
+ * Functions related to anv_batch
+ *-----------------------------------------------------------------------*/
+
+/* Reserve num_dwords dwords in the batch and return a pointer to them,
+ * extending the batch via extend_cb when there is not enough room.
+ *
+ * NOTE(review): extend_cb's VkResult is ignored; a failed extension is
+ * only caught by the assert below.
+ */
+void *
+anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
+{
+   if (batch->next + num_dwords * 4 > batch->end)
+      batch->extend_cb(batch, batch->user_data);
+
+   void *p = batch->next;
+
+   batch->next += num_dwords * 4;
+   assert(batch->next <= batch->end);
+
+   return p;
+}
+
+/* Record a relocation at `location` (a pointer into the batch) against
+ * bo+delta.  Returns the presumed address to write into the batch.
+ */
+uint64_t
+anv_batch_emit_reloc(struct anv_batch *batch,
+                     void *location, struct anv_bo *bo, uint32_t delta)
+{
+   return anv_reloc_list_add(batch->relocs, batch->alloc,
+                             location - batch->start, bo, delta);
+}
+
+/* Splice the contents of `other` into `batch`: copy its bytes and append
+ * its relocations rebased to the splice offset.  Extends `batch` first
+ * if needed (single extension assumed sufficient — see assert).
+ */
+void
+anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
+{
+   uint32_t size, offset;
+
+   size = other->next - other->start;
+   assert(size % 4 == 0);
+
+   if (batch->next + size > batch->end)
+      batch->extend_cb(batch, batch->user_data);
+
+   assert(batch->next + size <= batch->end);
+
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
+   memcpy(batch->next, other->start, size);
+
+   /* Relocation offsets in `other` are relative to its start; rebase
+    * them to where the copy landed in this batch.
+    */
+   offset = batch->next - batch->start;
+   anv_reloc_list_append(batch->relocs, batch->alloc,
+                         other->relocs, offset);
+
+   batch->next += size;
+}
+
+/*-----------------------------------------------------------------------*
+ * Functions related to anv_batch_bo
+ *-----------------------------------------------------------------------*/
+
+/* Allocate a new batch_bo: the struct itself from the command pool's
+ * allocator, the bo from the device's batch bo pool, plus an empty
+ * relocation list.  Cleans up partial state on failure via goto unwind.
+ */
+static VkResult
+anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_batch_bo **bbo_out)
+{
+   VkResult result;
+
+   struct anv_batch_bo *bbo = anv_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+                                        8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (bbo == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+   if (result != VK_SUCCESS)
+      goto fail_alloc;
+
+   result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
+   if (result != VK_SUCCESS)
+      goto fail_bo_alloc;
+
+   *bbo_out = bbo;
+
+   return VK_SUCCESS;
+
+ fail_bo_alloc:
+   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+ fail_alloc:
+   anv_free(&cmd_buffer->pool->alloc, bbo);
+
+   return result;
+}
+
+/* Create a deep copy of other_bbo: fresh bo from the pool, cloned
+ * relocation list, and the batch contents memcpy'd over.  Used when a
+ * secondary command buffer must be copied before chaining.
+ */
+static VkResult
+anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
+                   const struct anv_batch_bo *other_bbo,
+                   struct anv_batch_bo **bbo_out)
+{
+   VkResult result;
+
+   struct anv_batch_bo *bbo = anv_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+                                        8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (bbo == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+   if (result != VK_SUCCESS)
+      goto fail_alloc;
+
+   result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
+                                      &other_bbo->relocs);
+   if (result != VK_SUCCESS)
+      goto fail_bo_alloc;
+
+   /* Only the recorded portion (length), not the whole bo, is copied. */
+   bbo->length = other_bbo->length;
+   memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);
+
+   bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset;
+
+   *bbo_out = bbo;
+
+   return VK_SUCCESS;
+
+ fail_bo_alloc:
+   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+ fail_alloc:
+   anv_free(&cmd_buffer->pool->alloc, bbo);
+
+   return result;
+}
+
+/* Point `batch` at the start of bbo for fresh recording.  batch_padding
+ * bytes are held back at the end so there is always room to emit a
+ * chaining command.  Resets the bbo's relocation count.
+ */
+static void
+anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
+                   size_t batch_padding)
+{
+   batch->next = batch->start = bbo->bo.map;
+   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
+   batch->relocs = &bbo->relocs;
+   bbo->last_ss_pool_bo_offset = 0;
+   bbo->relocs.num_relocs = 0;
+}
+
+/* Like anv_batch_bo_start, but resume recording after bbo's existing
+ * contents (next = map + length) instead of from the beginning, and keep
+ * the existing relocations.
+ */
+static void
+anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
+                      size_t batch_padding)
+{
+   batch->start = bbo->bo.map;
+   batch->next = bbo->bo.map + bbo->length;
+   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
+   batch->relocs = &bbo->relocs;
+}
+
+/* Record how many bytes were actually emitted into bbo (its `length`),
+ * now that recording into it has stopped.
+ */
+static void
+anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
+{
+   assert(batch->start == bbo->bo.map);
+   bbo->length = batch->next - batch->start;
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
+}
+
+/* Release everything anv_batch_bo_create/clone acquired: the reloc list,
+ * the pooled bo, and the struct itself.
+ */
+static void
+anv_batch_bo_destroy(struct anv_batch_bo *bbo,
+                     struct anv_cmd_buffer *cmd_buffer)
+{
+   anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
+   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+   anv_free(&cmd_buffer->pool->alloc, bbo);
+}
+
+/* Deep-copy an entire chain of batch_bos into new_list, patching the
+ * inter-bo chaining relocations so each clone points at the next clone
+ * rather than at the original chain.  On failure, any clones already
+ * made are destroyed and the error is returned.
+ */
+static VkResult
+anv_batch_bo_list_clone(const struct list_head *list,
+                        struct anv_cmd_buffer *cmd_buffer,
+                        struct list_head *new_list)
+{
+   VkResult result = VK_SUCCESS;
+
+   list_inithead(new_list);
+
+   struct anv_batch_bo *prev_bbo = NULL;
+   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
+      struct anv_batch_bo *new_bbo;
+      result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
+      if (result != VK_SUCCESS)
+         break;
+      list_addtail(&new_bbo->link, new_list);
+
+      if (prev_bbo) {
+         /* As we clone this list of batch_bo's, they chain one to the
+          * other using MI_BATCH_BUFFER_START commands.  We need to fix up
+          * those relocations as we go.  Fortunately, this is pretty easy
+          * as it will always be the last relocation in the list.
+          */
+         uint32_t last_idx = prev_bbo->relocs.num_relocs - 1;
+         assert(prev_bbo->relocs.reloc_bos[last_idx] == &bbo->bo);
+         prev_bbo->relocs.reloc_bos[last_idx] = &new_bbo->bo;
+      }
+
+      prev_bbo = new_bbo;
+   }
+
+   if (result != VK_SUCCESS) {
+      /* Unwind: destroy every clone made so far. */
+      list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link)
+         anv_batch_bo_destroy(bbo, cmd_buffer);
+   }
+
+   return result;
+}
+
+/*-----------------------------------------------------------------------*
+ * Functions related to anv_batch_bo
+ *-----------------------------------------------------------------------*/
+
+/* The batch bo currently being recorded into is always the tail of the
+ * cmd_buffer's batch_bos list.
+ */
+static inline struct anv_batch_bo *
+anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
+{
+   return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
+}
+
+/* The surface-state base address for this command buffer: the device's
+ * surface state block pool bo, offset by the most recently allocated
+ * binding-table block (head of bt_blocks).
+ */
+struct anv_address
+anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
+{
+   return (struct anv_address) {
+      .bo = &cmd_buffer->device->surface_state_block_pool.bo,
+      .offset = *(int32_t *)anv_vector_head(&cmd_buffer->bt_blocks),
+   };
+}
+
+/* Emit an MI_BATCH_BUFFER_START that jumps to bo+offset, with a
+ * relocation for the address.
+ */
+static void
+emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
+                        struct anv_bo *bo, uint32_t offset)
+{
+   /* In gen8+ the address field grew to two dwords to accommodate 48 bit
+    * offsets. The high 16 bits are in the last dword, so we can use the gen8
+    * version in either case, as long as we set the instruction length in the
+    * header accordingly.  This means that we always emit three dwords here
+    * and all the padding and adjustment we do in this file works for all
+    * gens.
+    */
+
+   const uint32_t gen7_length =
+      GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias;
+   const uint32_t gen8_length =
+      GEN8_MI_BATCH_BUFFER_START_length - GEN8_MI_BATCH_BUFFER_START_length_bias;
+
+   anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START,
+      .DwordLength = cmd_buffer->device->info.gen < 8 ?
+                     gen7_length : gen8_length,
+      ._2ndLevelBatchBuffer = _1stlevelbatch,
+      .AddressSpaceIndicator = ASI_PPGTT,
+      .BatchBufferStartAddress = { bo, offset });
+}
+
+/* Terminate the current batch bo by emitting an MI_BATCH_BUFFER_START
+ * into its reserved padding that jumps to the start of `bbo`, then mark
+ * the current bo finished.
+ */
+static void
+cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_batch_bo *bbo)
+{
+   struct anv_batch *batch = &cmd_buffer->batch;
+   struct anv_batch_bo *current_bbo =
+      anv_cmd_buffer_current_batch_bo(cmd_buffer);
+
+   /* We set the end of the batch a little short so we would be sure we
+    * have room for the chaining command.  Since we're about to emit the
+    * chaining command, let's set it back where it should go.
+    */
+   batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4;
+   assert(batch->end == current_bbo->bo.map + current_bbo->bo.size);
+
+   emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0);
+
+   anv_batch_bo_finish(current_bbo, batch);
+}
+
+/* anv_batch extend_cb for command buffers: allocate a new batch bo,
+ * record it in seen_bbos, chain the current bo to it, append it to the
+ * batch_bos list, and redirect the batch to record into it.
+ */
+static VkResult
+anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)
+{
+   struct anv_cmd_buffer *cmd_buffer = _data;
+   struct anv_batch_bo *new_bbo;
+
+   VkResult result = anv_batch_bo_create(cmd_buffer, &new_bbo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct anv_batch_bo **seen_bbo = anv_vector_add(&cmd_buffer->seen_bbos);
+   if (seen_bbo == NULL) {
+      anv_batch_bo_destroy(new_bbo, cmd_buffer);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+   *seen_bbo = new_bbo;
+
+   cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);
+
+   list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
+
+   /* Again reserve padding for the next chaining command. */
+   anv_batch_bo_start(new_bbo, batch, GEN8_MI_BATCH_BUFFER_START_length * 4);
+
+   return VK_SUCCESS;
+}
+
+/* Allocate space for a binding table of `entries` 4-byte entries (32-byte
+ * aligned) from the current binding-table block.  Returns a zeroed
+ * anv_state if the block is full (caller must then grab a new block).
+ * *state_offset receives the negated block offset used to relocate
+ * surface-state pointers.
+ */
+struct anv_state
+anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t entries, uint32_t *state_offset)
+{
+   struct anv_block_pool *block_pool =
+       &cmd_buffer->device->surface_state_block_pool;
+   int32_t *bt_block = anv_vector_head(&cmd_buffer->bt_blocks);
+   struct anv_state state;
+
+   state.alloc_size = align_u32(entries * 4, 32);
+
+   if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size)
+      return (struct anv_state) { 0 };
+
+   state.offset = cmd_buffer->bt_next;
+   state.map = block_pool->map + *bt_block + state.offset;
+
+   cmd_buffer->bt_next += state.alloc_size;
+
+   /* bt blocks are allocated from the back of the pool, so the offset is
+    * always negative; callers add back its magnitude.
+    */
+   assert(*bt_block < 0);
+   *state_offset = -(*bt_block);
+
+   return state;
+}
+
+/* Allocate one 64-byte, 64-byte-aligned surface state from the command
+ * buffer's surface state stream.
+ */
+struct anv_state
+anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
+}
+
+/* Allocate `size` bytes with the given alignment from the command
+ * buffer's dynamic state stream.
+ */
+struct anv_state
+anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t size, uint32_t alignment)
+{
+   return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+                                 size, alignment);
+}
+
+/* Grab a fresh binding-table block from the back of the surface-state
+ * block pool, push its offset onto bt_blocks, and reset bt_next so
+ * subsequent binding tables allocate from the new block.
+ */
+VkResult
+anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_block_pool *block_pool =
+       &cmd_buffer->device->surface_state_block_pool;
+
+   int32_t *offset = anv_vector_add(&cmd_buffer->bt_blocks);
+   if (offset == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   *offset = anv_block_pool_alloc_back(block_pool);
+   cmd_buffer->bt_next = 0;
+
+   return VK_SUCCESS;
+}
+
+/* Set up all batch-chaining state for a freshly created command buffer:
+ * the first batch bo, the seen_bbos and bt_blocks vectors, the surface
+ * relocation list, the first binding-table block, and empty execbuf2
+ * arrays.  Unwinds everything acquired so far on failure.
+ */
+VkResult
+anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_batch_bo *batch_bo;
+   VkResult result;
+
+   list_inithead(&cmd_buffer->batch_bos);
+
+   result = anv_batch_bo_create(cmd_buffer, &batch_bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
+
+   cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
+   cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
+   cmd_buffer->batch.user_data = cmd_buffer;
+
+   /* Hold back room for the MI_BATCH_BUFFER_START used to chain. */
+   anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
+                      GEN8_MI_BATCH_BUFFER_START_length * 4);
+
+   int success = anv_vector_init(&cmd_buffer->seen_bbos,
+                                 sizeof(struct anv_bo *),
+                                 8 * sizeof(struct anv_bo *));
+   if (!success)
+      goto fail_batch_bo;
+
+   *(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
+
+   success = anv_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t),
+                             8 * sizeof(int32_t));
+   if (!success)
+      goto fail_seen_bbos;
+
+   result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
+                                &cmd_buffer->pool->alloc);
+   if (result != VK_SUCCESS)
+      goto fail_bt_blocks;
+
+   /* NOTE(review): the result of this call is ignored; an OOM here
+    * leaves the buffer without a binding-table block.
+    */
+   anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+
+   cmd_buffer->execbuf2.objects = NULL;
+   cmd_buffer->execbuf2.bos = NULL;
+   cmd_buffer->execbuf2.array_length = 0;
+
+   return VK_SUCCESS;
+
+ fail_bt_blocks:
+   anv_vector_finish(&cmd_buffer->bt_blocks);
+ fail_seen_bbos:
+   anv_vector_finish(&cmd_buffer->seen_bbos);
+ fail_batch_bo:
+   anv_batch_bo_destroy(batch_bo, cmd_buffer);
+
+   return result;
+}
+
+/* Inverse of anv_cmd_buffer_init_batch_bo_chain: return all bt blocks to
+ * the pool, free the vectors, the surface reloc list, every batch bo,
+ * and the execbuf2 arrays.
+ */
+void
+anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
+{
+   int32_t *bt_block;
+   anv_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
+      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
+                          *bt_block);
+   }
+   anv_vector_finish(&cmd_buffer->bt_blocks);
+
+   anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
+
+   anv_vector_finish(&cmd_buffer->seen_bbos);
+
+   /* Destroy all of the batch buffers */
+   list_for_each_entry_safe(struct anv_batch_bo, bbo,
+                            &cmd_buffer->batch_bos, link) {
+      anv_batch_bo_destroy(bbo, cmd_buffer);
+   }
+
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects);
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos);
+}
+
+/* Reset the chain for command-buffer reuse: keep exactly one batch bo
+ * and one binding-table block, destroy/free the rest, clear surface
+ * relocations, and rewind recording to the start of the surviving bo.
+ */
+void
+anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* Delete all but the first batch bo */
+   assert(!list_empty(&cmd_buffer->batch_bos));
+   while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
+      struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+      list_del(&bbo->link);
+      anv_batch_bo_destroy(bbo, cmd_buffer);
+   }
+   assert(!list_empty(&cmd_buffer->batch_bos));
+
+   anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
+                      &cmd_buffer->batch,
+                      GEN8_MI_BATCH_BUFFER_START_length * 4);
+
+   /* Return all but one binding-table block to the pool. */
+   while (anv_vector_length(&cmd_buffer->bt_blocks) > 1) {
+      int32_t *bt_block = anv_vector_remove(&cmd_buffer->bt_blocks);
+      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
+                          *bt_block);
+   }
+   assert(anv_vector_length(&cmd_buffer->bt_blocks) == 1);
+   cmd_buffer->bt_next = 0;
+
+   cmd_buffer->surface_relocs.num_relocs = 0;
+
+   /* Reset the list of seen buffers */
+   cmd_buffer->seen_bbos.head = 0;
+   cmd_buffer->seen_bbos.tail = 0;
+
+   *(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) =
+      anv_cmd_buffer_current_batch_bo(cmd_buffer);
+}
+
+/* Finish recording: for primaries, emit MI_BATCH_BUFFER_END (padded to an
+ * even dword count) and mark exec mode PRIMARY.  For secondaries, choose
+ * how vkCmdExecuteCommands will splice this buffer (EMIT / CHAIN /
+ * COPY_AND_CHAIN) based on its size and usage flags.
+ */
+void
+anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+      /* When we start a batch buffer, we subtract a certain amount of
+       * padding from the end to ensure that we always have room to emit a
+       * BATCH_BUFFER_START to chain to the next BO.  We need to remove
+       * that padding before we end the batch; otherwise, we may end up
+       * with our BATCH_BUFFER_END in another BO.
+       */
+      cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4;
+      assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size);
+
+      anv_batch_emit(&cmd_buffer->batch, GEN7_MI_BATCH_BUFFER_END);
+
+      /* Round batch up to an even number of dwords. */
+      if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
+         anv_batch_emit(&cmd_buffer->batch, GEN7_MI_NOOP);
+
+      cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
+   }
+
+   anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
+
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      /* If this is a secondary command buffer, we need to determine the
+       * mode in which it will be executed with vkExecuteCommands.  We
+       * determine this statically here so that this stays in sync with the
+       * actual ExecuteCommands implementation.
+       */
+      if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
+          (batch_bo->length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) {
+         /* If the secondary has exactly one batch buffer in its list *and*
+          * that batch buffer is less than half of the maximum size, we're
+          * probably better off simply copying it into our batch.
+          */
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
+      } else if (!(cmd_buffer->usage_flags &
+                   VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
+
+         /* When we chain, we need to add an MI_BATCH_BUFFER_START command
+          * with its relocation.  In order to handle this we'll increment here
+          * so we can unconditionally decrement right before adding the
+          * MI_BATCH_BUFFER_START command.
+          */
+         batch_bo->relocs.num_relocs++;
+         cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4;
+      } else {
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
+      }
+   }
+}
+
+/* Append every batch bo on `list` to the command buffer's seen_bbos
+ * vector (used later to build the execbuf object list).
+ */
+static inline VkResult
+anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
+                             struct list_head *list)
+{
+   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
+      struct anv_batch_bo **bbo_ptr = anv_vector_add(&cmd_buffer->seen_bbos);
+      if (bbo_ptr == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      *bbo_ptr = bbo;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Splice a fully-recorded secondary command buffer into `primary` at its
+ * current batch position.  The mechanism depends on the execution mode the
+ * secondary selected when it was ended.
+ */
+void
+anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
+                             struct anv_cmd_buffer *secondary)
+{
+   switch (secondary->exec_mode) {
+   case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
+      /* Small secondary: copy its commands directly into the primary. */
+      anv_batch_emit_batch(&primary->batch, &secondary->batch);
+      break;
+   case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
+      struct anv_batch_bo *first_bbo =
+         list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
+      struct anv_batch_bo *last_bbo =
+         list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
+
+      /* Jump from the primary into the secondary's first batch BO... */
+      emit_batch_buffer_start(primary, &first_bbo->bo, 0);
+
+      /* ...and patch the secondary's tail to jump back to the primary at
+       * the instruction following the jump we just emitted.
+       */
+      struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
+      assert(primary->batch.start == this_bbo->bo.map);
+      uint32_t offset = primary->batch.next - primary->batch.start;
+      const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4;
+
+      /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so we
+       * can emit a new command and relocation for the current splice.  In
+       * order to handle the initial-use case, we incremented next and
+       * num_relocs in end_batch_buffer() so we can always just subtract
+       * here.
+       */
+      last_bbo->relocs.num_relocs--;
+      secondary->batch.next -= inst_size;
+      emit_batch_buffer_start(secondary, &this_bbo->bo, offset);
+      /* NOTE(review): the VkResult from add_seen_bbos is dropped here, so an
+       * OOM while growing seen_bbos goes unreported — confirm intent.
+       */
+      anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
+
+      /* After patching up the secondary buffer, we need to clflush the
+       * modified instruction in case we're on a !llc platform. We use a
+       * little loop to handle the case where the instruction crosses a cache
+       * line boundary.
+       */
+      if (!primary->device->info.has_llc) {
+         void *inst = secondary->batch.next - inst_size;
+         void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK);
+         __builtin_ia32_sfence();
+         while (p < secondary->batch.next) {
+            __builtin_ia32_clflush(p);
+            p += CACHELINE_SIZE;
+         }
+      }
+
+      break;
+   }
+   case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
+      /* The secondary may be executed again, so chain in a private copy of
+       * its batch BO list instead of patching the original.
+       */
+      struct list_head copy_list;
+      VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
+                                                secondary,
+                                                &copy_list);
+      if (result != VK_SUCCESS)
+         return; /* FIXME */
+
+      anv_cmd_buffer_add_seen_bbos(primary, &copy_list);
+
+      struct anv_batch_bo *first_bbo =
+         list_first_entry(&copy_list, struct anv_batch_bo, link);
+      struct anv_batch_bo *last_bbo =
+         list_last_entry(&copy_list, struct anv_batch_bo, link);
+
+      cmd_buffer_chain_to_batch_bo(primary, first_bbo);
+
+      list_splicetail(&copy_list, &primary->batch_bos);
+
+      /* Resume recording the primary in the cloned tail, leaving room for a
+       * final MI_BATCH_BUFFER_START.
+       */
+      anv_batch_bo_continue(last_bbo, &primary->batch,
+                            GEN8_MI_BATCH_BUFFER_START_length * 4);
+
+      anv_cmd_buffer_emit_state_base_address(primary);
+      break;
+   }
+   default:
+      assert(!"Invalid execution mode");
+   }
+
+   /* The secondary's surface-state relocations become the primary's. */
+   anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
+                         &secondary->surface_relocs, 0);
+}
+
+/* Add `bo` (and, the first time a reloc list is supplied, every BO it
+ * relocates against) to the execbuf2 validation list, growing the backing
+ * arrays as needed.  Assigns bo->index so later lookups are O(1).
+ *
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the arrays cannot grow.
+ */
+static VkResult
+anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer,
+                      struct anv_bo *bo,
+                      struct anv_reloc_list *relocs)
+{
+   struct drm_i915_gem_exec_object2 *obj = NULL;
+
+   /* Fast path: bo->index is trustworthy only if it points back at us. */
+   if (bo->index < cmd_buffer->execbuf2.bo_count &&
+       cmd_buffer->execbuf2.bos[bo->index] == bo)
+      obj = &cmd_buffer->execbuf2.objects[bo->index];
+
+   if (obj == NULL) {
+      /* We've never seen this one before.  Add it to the list and assign
+       * an id that we can use later.
+       */
+      if (cmd_buffer->execbuf2.bo_count >= cmd_buffer->execbuf2.array_length) {
+         uint32_t new_len = cmd_buffer->execbuf2.objects ?
+                            cmd_buffer->execbuf2.array_length * 2 : 64;
+
+         struct drm_i915_gem_exec_object2 *new_objects =
+            anv_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_objects),
+                      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+         if (new_objects == NULL)
+            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+         struct anv_bo **new_bos =
+            anv_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_bos),
+                      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+         /* Bug fix: this previously re-tested new_objects, so a failed
+          * new_bos allocation went undetected and was written through below.
+          */
+         if (new_bos == NULL) {
+            anv_free(&cmd_buffer->pool->alloc, new_objects);
+            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+         }
+
+         if (cmd_buffer->execbuf2.objects) {
+            memcpy(new_objects, cmd_buffer->execbuf2.objects,
+                   cmd_buffer->execbuf2.bo_count * sizeof(*new_objects));
+            memcpy(new_bos, cmd_buffer->execbuf2.bos,
+                   cmd_buffer->execbuf2.bo_count * sizeof(*new_bos));
+            /* Release the outgrown arrays; teardown only frees the current
+             * pointers, so dropping these here would leak them.
+             */
+            anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects);
+            anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos);
+         }
+
+         cmd_buffer->execbuf2.objects = new_objects;
+         cmd_buffer->execbuf2.bos = new_bos;
+         cmd_buffer->execbuf2.array_length = new_len;
+      }
+
+      assert(cmd_buffer->execbuf2.bo_count < cmd_buffer->execbuf2.array_length);
+
+      bo->index = cmd_buffer->execbuf2.bo_count++;
+      obj = &cmd_buffer->execbuf2.objects[bo->index];
+      cmd_buffer->execbuf2.bos[bo->index] = bo;
+
+      obj->handle = bo->gem_handle;
+      obj->relocation_count = 0;
+      obj->relocs_ptr = 0;
+      obj->alignment = 0;
+      obj->offset = bo->offset;
+      obj->flags = 0;
+      obj->rsvd1 = 0;
+      obj->rsvd2 = 0;
+   }
+
+   if (relocs != NULL && obj->relocation_count == 0) {
+      /* This is the first time we've ever seen a list of relocations for
+       * this BO.  Go ahead and set the relocations and then walk the list
+       * of relocations and add them all.
+       */
+      obj->relocation_count = relocs->num_relocs;
+      obj->relocs_ptr = (uintptr_t) relocs->relocs;
+
+      for (size_t i = 0; i < relocs->num_relocs; i++) {
+         /* A quick sanity check on relocations */
+         assert(relocs->relocs[i].offset < bo->size);
+         anv_cmd_buffer_add_bo(cmd_buffer, relocs->reloc_bos[i], NULL);
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+static void
+anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
+                              struct anv_reloc_list *list)
+{
+   /* Point every relocation at the validation-list index of its target BO
+    * and decide whether the kernel actually needs to process relocations.
+    *
+    * If the kernel supports I915_EXEC_NO_RELOC, it compares the offset in
+    * each drm_i915_gem_exec_object2 against the BO's current offset and
+    * skips relocation processing altogether when nothing has moved.  When
+    * I915_EXEC_NO_RELOC is unsupported, the incoming offset is ignored, so
+    * setting it either way is fine as long as all relocs share the same
+    * presumed offset.
+    */
+   for (size_t i = 0; i < list->num_relocs; i++) {
+      struct anv_bo *target = list->reloc_bos[i];
+
+      if (target->offset != list->relocs[i].presumed_offset)
+         cmd_buffer->execbuf2.need_reloc = true;
+
+      list->relocs[i].target_handle = target->index;
+   }
+}
+
+/* Fix up relocations whose *source* lives inside the block pool `pool`
+ * (e.g. surface states): refresh each presumed_offset from the mapped data
+ * and rebase reloc offsets from pool-center-relative to GEM-bo-relative.
+ */
+static void
+adjust_relocations_from_block_pool(struct anv_block_pool *pool,
+                                   struct anv_reloc_list *relocs)
+{
+   for (size_t i = 0; i < relocs->num_relocs; i++) {
+      /* In general, we don't know how stale the relocated value is.  It
+       * may have been used last time or it may not.  Since we don't want
+       * to stomp it while the GPU may be accessing it, we haven't updated
+       * it anywhere else in the code.  Instead, we just set the presumed
+       * offset to what it is now based on the delta and the data in the
+       * block pool.  Then the kernel will update it for us if needed.
+       */
+      assert(relocs->relocs[i].offset < pool->state.end);
+      uint32_t *reloc_data = pool->map + relocs->relocs[i].offset;
+
+      /* We're reading back the relocated value from potentially incoherent
+       * memory here. However, any change to the value will be from the kernel
+       * writing out relocations, which will keep the CPU cache up to date.
+       */
+      relocs->relocs[i].presumed_offset = *reloc_data - relocs->relocs[i].delta;
+
+      /* All of the relocations from this block pool to other BO's should
+       * have been emitted relative to the surface block pool center.  We
+       * need to add the center offset to make them relative to the
+       * beginning of the actual GEM bo.
+       */
+      relocs->relocs[i].offset += pool->center_bo_offset;
+   }
+}
+
+/* Fix up relocations in `from_bo` (a batch buffer) whose *target* is the
+ * block pool `pool`: shift their deltas by however much the pool's
+ * center_bo_offset moved since the last call, and rewrite the relocated
+ * value in the batch to match.  `last_pool_center_bo_offset` tracks the
+ * offset this reloc list was last adjusted to.
+ */
+static void
+adjust_relocations_to_block_pool(struct anv_block_pool *pool,
+                                 struct anv_bo *from_bo,
+                                 struct anv_reloc_list *relocs,
+                                 uint32_t *last_pool_center_bo_offset)
+{
+   /* The pool only ever grows, so the center offset never decreases. */
+   assert(*last_pool_center_bo_offset <= pool->center_bo_offset);
+   uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset;
+
+   /* When we initially emit relocations into a block pool, we don't
+    * actually know what the final center_bo_offset will be so we just emit
+    * it as if center_bo_offset == 0.  Now that we know what the center
+    * offset is, we need to walk the list of relocations and adjust any
+    * relocations that point to the pool bo with the correct offset.
+    */
+   for (size_t i = 0; i < relocs->num_relocs; i++) {
+      if (relocs->reloc_bos[i] == &pool->bo) {
+         /* Adjust the delta value in the relocation to correctly
+          * correspond to the new delta.  Initially, this value may have
+          * been negative (if treated as unsigned), but we trust in
+          * uint32_t roll-over to fix that for us at this point.
+          */
+         relocs->relocs[i].delta += delta;
+
+         /* Since the delta has changed, we need to update the actual
+          * relocated value with the new presumed value.  This function
+          * should only be called on batch buffers, so we know it isn't in
+          * use by the GPU at the moment.
+          */
+         assert(relocs->relocs[i].offset < from_bo->size);
+         uint32_t *reloc_data = from_bo->map + relocs->relocs[i].offset;
+         *reloc_data = relocs->relocs[i].presumed_offset +
+                       relocs->relocs[i].delta;
+      }
+   }
+
+   *last_pool_center_bo_offset = pool->center_bo_offset;
+}
+
+/* Build the drm_i915_gem_execbuffer2 validation list and parameters for this
+ * command buffer.  Callers serialize this via the device mutex because it
+ * mutates the global bo->index fields (see anv_EndCommandBuffer).
+ */
+void
+anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_batch *batch = &cmd_buffer->batch;
+   struct anv_block_pool *ss_pool =
+      &cmd_buffer->device->surface_state_block_pool;
+
+   cmd_buffer->execbuf2.bo_count = 0;
+   cmd_buffer->execbuf2.need_reloc = false;
+
+   /* NOTE(review): the VkResult of anv_cmd_buffer_add_bo is ignored here
+    * and below; an allocation failure would go unreported — confirm intent.
+    */
+   adjust_relocations_from_block_pool(ss_pool, &cmd_buffer->surface_relocs);
+   anv_cmd_buffer_add_bo(cmd_buffer, &ss_pool->bo, &cmd_buffer->surface_relocs);
+
+   /* First, we walk over all of the bos we've seen and add them and their
+    * relocations to the validate list.
+    */
+   struct anv_batch_bo **bbo;
+   anv_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+      adjust_relocations_to_block_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
+                                       &(*bbo)->last_ss_pool_bo_offset);
+
+      anv_cmd_buffer_add_bo(cmd_buffer, &(*bbo)->bo, &(*bbo)->relocs);
+   }
+
+   struct anv_batch_bo *first_batch_bo =
+      list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
+
+   /* The kernel requires that the last entry in the validation list be the
+    * batch buffer to execute.  We can simply swap the element
+    * corresponding to the first batch_bo in the chain with the last
+    * element in the list.
+    */
+   if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
+      uint32_t idx = first_batch_bo->bo.index;
+      uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
+
+      struct drm_i915_gem_exec_object2 tmp_obj =
+         cmd_buffer->execbuf2.objects[idx];
+      assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
+
+      cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
+      cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
+      cmd_buffer->execbuf2.bos[idx]->index = idx;
+
+      cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
+      cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
+      first_batch_bo->bo.index = last_idx;
+   }
+
+   /* Now we go through and fixup all of the relocation lists to point to
+    * the correct indices in the object array.  We have to do this after we
+    * reorder the list above as some of the indices may have changed.
+    */
+   anv_vector_foreach(bbo, &cmd_buffer->seen_bbos)
+      anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);
+
+   anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);
+
+   /* On non-LLC parts, flush every batch BO out of the CPU cache so the GPU
+    * sees the patched-up contents.
+    */
+   if (!cmd_buffer->device->info.has_llc) {
+      __builtin_ia32_sfence();
+      anv_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+         for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
+            __builtin_ia32_clflush((*bbo)->bo.map + i);
+      }
+   }
+
+   cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
+      .buffer_count = cmd_buffer->execbuf2.bo_count,
+      .batch_start_offset = 0,
+      .batch_len = batch->next - batch->start,
+      .cliprects_ptr = 0,
+      .num_cliprects = 0,
+      .DR1 = 0,
+      .DR4 = 0,
+      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER |
+               I915_EXEC_CONSTANTS_REL_GENERAL,
+      .rsvd1 = cmd_buffer->device->context_id,
+      .rsvd2 = 0,
+   };
+
+   /* If no BO moved since the relocs were written, let the kernel skip
+    * relocation processing entirely.
+    */
+   if (!cmd_buffer->execbuf2.need_reloc)
+      cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC;
+}
diff --git a/src/vulkan/anv_cmd_buffer.c b/src/vulkan/anv_cmd_buffer.c
new file mode 100644 (file)
index 0000000..070b849
--- /dev/null
@@ -0,0 +1,1198 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+/** \file anv_cmd_buffer.c
+ *
+ * This file contains all of the stuff for emitting commands into a command
+ * buffer.  This includes implementations of most of the vkCmd*
+ * entrypoints.  This file is concerned entirely with state emission and
+ * not with the command buffer data structure itself.  As far as this file
+ * is concerned, most of anv_cmd_buffer is magic.
+ */
+
+/* Dynamic state a freshly reset command buffer starts with (see
+ * anv_cmd_state_reset).
+ *
+ * TODO: These are taken from GLES.  We should check the Vulkan spec */
+const struct anv_dynamic_state default_dynamic_state = {
+   .viewport = {
+      .count = 0,
+   },
+   .scissor = {
+      .count = 0,
+   },
+   .line_width = 1.0f,
+   .depth_bias = {
+      .bias = 0.0f,
+      .clamp = 0.0f,
+      .slope = 0.0f,
+   },
+   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
+   .depth_bounds = {
+      .min = 0.0f,
+      .max = 1.0f,
+   },
+   .stencil_compare_mask = {
+      .front = ~0u,
+      .back = ~0u,
+   },
+   .stencil_write_mask = {
+      .front = ~0u,
+      .back = ~0u,
+   },
+   .stencil_reference = {
+      .front = 0u,
+      .back = 0u,
+   },
+};
+
+void
+anv_dynamic_state_copy(struct anv_dynamic_state *dest,
+                       const struct anv_dynamic_state *src,
+                       uint32_t copy_mask)
+{
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
+      dest->viewport.count = src->viewport.count;
+      typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
+                   src->viewport.count);
+   }
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
+      dest->scissor.count = src->scissor.count;
+      typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
+                   src->scissor.count);
+   }
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_LINE_WIDTH))
+      dest->line_width = src->line_width;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BIAS))
+      dest->depth_bias = src->depth_bias;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS))
+      typed_memcpy(dest->blend_constants, src->blend_constants, 4);
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS))
+      dest->depth_bounds = src->depth_bounds;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK))
+      dest->stencil_compare_mask = src->stencil_compare_mask;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK))
+      dest->stencil_write_mask = src->stencil_write_mask;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE))
+      dest->stencil_reference = src->stencil_reference;
+}
+
+static void
+anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* Return the recorded state to the values a fresh command buffer has. */
+   struct anv_cmd_state *state = &cmd_buffer->state;
+
+   memset(&state->descriptors, 0, sizeof(state->descriptors));
+   memset(&state->push_constants, 0, sizeof(state->push_constants));
+
+   state->dirty = ~0;
+   state->vb_dirty = 0;
+   state->descriptors_dirty = 0;
+   state->push_constants_dirty = 0;
+   state->pipeline = NULL;
+   state->dynamic = default_dynamic_state;
+   state->restart_index = UINT32_MAX;
+   state->need_query_wa = true;
+
+   /* Attachment state is (re)allocated by vkCmdBeginRenderPass; release any
+    * allocation left from a previous recording.  anv_free on NULL is a
+    * no-op (see anv_cmd_state_setup_attachments), so no guard is needed.
+    */
+   anv_free(&cmd_buffer->pool->alloc, state->attachments);
+   state->attachments = NULL;
+
+   state->gen7.index_buffer = NULL;
+}
+
+/**
+ * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
+ *
+ * Allocates one entry per render-pass attachment and records which aspects
+ * (color/depth/stencil) still need their LOAD_OP_CLEAR performed, along
+ * with the corresponding clear value from the begin info.
+ */
+void
+anv_cmd_state_setup_attachments(struct anv_cmd_buffer *cmd_buffer,
+                                const VkRenderPassBeginInfo *info)
+{
+   struct anv_cmd_state *state = &cmd_buffer->state;
+   ANV_FROM_HANDLE(anv_render_pass, pass, info->renderPass);
+
+   /* Drop any attachment array from a previous render pass. */
+   anv_free(&cmd_buffer->pool->alloc, state->attachments);
+
+   if (pass->attachment_count == 0) {
+      state->attachments = NULL;
+      return;
+   }
+
+   state->attachments = anv_alloc(&cmd_buffer->pool->alloc,
+                                  pass->attachment_count *
+                                       sizeof(state->attachments[0]),
+                                  8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (state->attachments == NULL) {
+      /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
+      abort();
+   }
+
+   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
+      struct anv_render_pass_attachment *att = &pass->attachments[i];
+      VkImageAspectFlags clear_aspects = 0;
+
+      if (anv_format_is_color(att->format)) {
+         /* color attachment */
+         if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+            clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
+         }
+      } else {
+         /* depthstencil attachment: depth and stencil clears are tracked
+          * independently since each aspect has its own load op.
+          */
+         if (att->format->depth_format &&
+             att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+            clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+         }
+         if (att->format->has_stencil &&
+             att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+            clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+         }
+      }
+
+      state->attachments[i].pending_clear_aspects = clear_aspects;
+      if (clear_aspects) {
+         /* The app must supply a clear value for every attachment that is
+          * cleared on load.
+          */
+         assert(info->clearValueCount > i);
+         state->attachments[i].clear_value = info->pClearValues[i];
+      }
+   }
+}
+
+/* Ensure the push-constant buffer for `stage` can hold at least `size`
+ * bytes, allocating or growing it as needed and recording the new size.
+ *
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure; on failure the
+ * existing buffer (if any) is left intact.
+ */
+static VkResult
+anv_cmd_buffer_ensure_push_constants_size(struct anv_cmd_buffer *cmd_buffer,
+                                          gl_shader_stage stage, uint32_t size)
+{
+   struct anv_push_constants **ptr = &cmd_buffer->state.push_constants[stage];
+
+   if (*ptr == NULL) {
+      *ptr = anv_alloc(&cmd_buffer->pool->alloc, size, 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (*ptr == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   } else if ((*ptr)->size < size) {
+      /* Grow through a temporary so a failed realloc doesn't overwrite the
+       * only pointer to the existing buffer (which would leak it and leave
+       * *ptr NULL while the old data is still live).
+       */
+      struct anv_push_constants *new_data =
+         anv_realloc(&cmd_buffer->pool->alloc, *ptr, size, 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (new_data == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      *ptr = new_data;
+   }
+   (*ptr)->size = size;
+
+   return VK_SUCCESS;
+}
+
+/* Ensure the push-constant buffer for `stage` is large enough to contain
+ * everything up to and including the named member of
+ * struct anv_push_constants.
+ */
+#define anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, field) \
+   anv_cmd_buffer_ensure_push_constants_size(cmd_buffer, stage, \
+      (offsetof(struct anv_push_constants, field) + \
+       sizeof(cmd_buffer->state.push_constants[0]->field)))
+
+/* Allocate and initialize a single command buffer from `pool` and return
+ * its handle in *pCommandBuffer.  On failure, everything allocated here is
+ * released and an error VkResult is returned.
+ */
+static VkResult anv_create_cmd_buffer(
+    struct anv_device *                         device,
+    struct anv_cmd_pool *                       pool,
+    VkCommandBufferLevel                        level,
+    VkCommandBuffer*                            pCommandBuffer)
+{
+   struct anv_cmd_buffer *cmd_buffer;
+   VkResult result;
+
+   cmd_buffer = anv_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
+                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (cmd_buffer == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+   cmd_buffer->device = device;
+   cmd_buffer->pool = pool;
+   cmd_buffer->level = level;
+   cmd_buffer->state.attachments = NULL;
+
+   result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   /* The state streams are only initialized after the batch BO chain, so
+    * the fail path never has to tear them down.
+    */
+   anv_state_stream_init(&cmd_buffer->surface_state_stream,
+                         &device->surface_state_block_pool);
+   anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
+                         &device->dynamic_state_block_pool);
+
+   if (pool) {
+      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+   } else {
+      /* Init the pool_link so we can safely call list_del when we destroy
+       * the command buffer
+       */
+      list_inithead(&cmd_buffer->pool_link);
+   }
+
+   *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
+
+   return VK_SUCCESS;
+
+ fail:
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer);
+
+   return result;
+}
+
+VkResult anv_AllocateCommandBuffers(
+    VkDevice                                    _device,
+    const VkCommandBufferAllocateInfo*          pAllocateInfo,
+    VkCommandBuffer*                            pCommandBuffers)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, pAllocateInfo->commandPool);
+
+   /* Create the requested number of command buffers one at a time.  If any
+    * creation fails, destroy those already created and report the error.
+    */
+   uint32_t created = 0;
+   VkResult result = VK_SUCCESS;
+
+   while (created < pAllocateInfo->commandBufferCount) {
+      result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level,
+                                     &pCommandBuffers[created]);
+      if (result != VK_SUCCESS) {
+         anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
+                                created, pCommandBuffers);
+         break;
+      }
+      created++;
+   }
+
+   return result;
+}
+
+/* Tear down everything anv_create_cmd_buffer set up, in reverse order, and
+ * free the command buffer itself.
+ */
+static void
+anv_cmd_buffer_destroy(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* pool_link is always initialized (see anv_create_cmd_buffer), so this
+    * is safe even for a pool-less command buffer.
+    */
+   list_del(&cmd_buffer->pool_link);
+
+   anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
+
+   anv_state_stream_finish(&cmd_buffer->surface_state_stream);
+   anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
+
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer);
+}
+
+void anv_FreeCommandBuffers(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCommandBuffers)
+{
+   /* Destroy each handle in the array; device and commandPool are unused
+    * because every anv_cmd_buffer already knows its own pool.
+    */
+   for (uint32_t i = 0; i < commandBufferCount; i++)
+      anv_cmd_buffer_destroy(anv_cmd_buffer_from_handle(pCommandBuffers[i]));
+}
+
+/* Reset a command buffer back to the initial (recordable) state: clear the
+ * usage flags, the batch BO chain, and all recorded command state.
+ */
+VkResult anv_ResetCommandBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkCommandBufferResetFlags                   flags)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->usage_flags = 0;
+   /* Force the first pipeline bind after reset to re-emit PIPELINE_SELECT. */
+   cmd_buffer->state.current_pipeline = UINT32_MAX;
+   anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
+   anv_cmd_state_reset(cmd_buffer);
+
+   return VK_SUCCESS;
+}
+
+/* Dispatch STATE_BASE_ADDRESS emission to the generation-specific
+ * implementation.  Haswell (gen 7.5) gets its own gen75 variant; the
+ * previous code called the gen7 version on both sides of the branch, which
+ * made the is_haswell check a no-op.
+ */
+void
+anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
+{
+   switch (cmd_buffer->device->info.gen) {
+   case 7:
+      if (cmd_buffer->device->info.is_haswell)
+         return gen75_cmd_buffer_emit_state_base_address(cmd_buffer);
+      else
+         return gen7_cmd_buffer_emit_state_base_address(cmd_buffer);
+   case 8:
+      return gen8_cmd_buffer_emit_state_base_address(cmd_buffer);
+   case 9:
+      return gen9_cmd_buffer_emit_state_base_address(cmd_buffer);
+   default:
+      unreachable("unsupported gen\n");
+   }
+}
+
+/* Put the command buffer into the recording state.  Per the spec this
+ * implicitly resets, so begin always starts from a clean slate.
+ */
+VkResult anv_BeginCommandBuffer(
+    VkCommandBuffer                             commandBuffer,
+    const VkCommandBufferBeginInfo*             pBeginInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* If this is the first vkBeginCommandBuffer, we must *initialize* the
+    * command buffer's state. Otherwise, we must *reset* its state. In both
+    * cases we reset it.
+    *
+    * From the Vulkan 1.0 spec:
+    *
+    *    If a command buffer is in the executable state and the command buffer
+    *    was allocated from a command pool with the
+    *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
+    *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
+    *    as if vkResetCommandBuffer had been called with
+    *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
+    *    the command buffer in the recording state.
+    */
+   anv_ResetCommandBuffer(commandBuffer, /*flags*/ 0);
+
+   cmd_buffer->usage_flags = pBeginInfo->flags;
+
+   /* RENDER_PASS_CONTINUE is only meaningful for secondary buffers. */
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
+          !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
+
+   if (cmd_buffer->usage_flags &
+       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+      /* Secondary buffer executing inside a render pass: adopt the render
+       * pass, framebuffer, and subpass from the inheritance info.
+       */
+      cmd_buffer->state.framebuffer =
+         anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
+      cmd_buffer->state.pass =
+         anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
+
+      struct anv_subpass *subpass =
+         &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
+
+      anv_cmd_buffer_set_subpass(cmd_buffer, subpass);
+   }
+
+   anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+
+   return VK_SUCCESS;
+}
+
+/* Finish recording: terminate the batch and, for primaries, build the
+ * execbuf parameters so submission is cheap.
+ */
+VkResult anv_EndCommandBuffer(
+    VkCommandBuffer                             commandBuffer)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_device *device = cmd_buffer->device;
+
+   anv_cmd_buffer_end_batch_buffer(cmd_buffer);
+
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+      /* The algorithm used to compute the validate list is not threadsafe as
+       * it uses the bo->index field.  We have to lock the device around it.
+       * Fortunately, the chances for contention here are probably very low.
+       */
+      pthread_mutex_lock(&device->mutex);
+      anv_cmd_buffer_prepare_execbuf(cmd_buffer);
+      pthread_mutex_unlock(&device->mutex);
+   }
+
+   return VK_SUCCESS;
+}
+
+void anv_CmdBindPipeline(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineBindPoint                         pipelineBindPoint,
+    VkPipeline                                  _pipeline)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+   struct anv_cmd_state *state = &cmd_buffer->state;
+
+   switch (pipelineBindPoint) {
+   case VK_PIPELINE_BIND_POINT_GRAPHICS:
+      state->pipeline = pipeline;
+      state->vb_dirty |= pipeline->vb_used;
+      state->dirty |= ANV_CMD_DIRTY_PIPELINE;
+      state->push_constants_dirty |= pipeline->active_stages;
+
+      /* Copy over the dynamic state owned by the pipeline and mark it for
+       * re-emission.
+       */
+      state->dirty |= pipeline->dynamic_state_mask;
+      anv_dynamic_state_copy(&state->dynamic, &pipeline->dynamic_state,
+                             pipeline->dynamic_state_mask);
+      break;
+
+   case VK_PIPELINE_BIND_POINT_COMPUTE:
+      state->compute_pipeline = pipeline;
+      state->compute_dirty |= ANV_CMD_DIRTY_PIPELINE;
+      state->push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+      break;
+
+   default:
+      assert(!"invalid bind point");
+      break;
+   }
+}
+
+/* Record dynamic viewports [firstViewport, firstViewport + viewportCount)
+ * and mark the viewport state dirty.
+ */
+void anv_CmdSetViewport(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstViewport,
+    uint32_t                                    viewportCount,
+    const VkViewport*                           pViewports)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* Grow (never shrink) the tracked viewport count.  The previous code had
+    * a stray semicolon after the `if`, making the assignment unconditional
+    * and allowing the count to shrink.
+    */
+   const uint32_t total_count = firstViewport + viewportCount;
+   if (cmd_buffer->state.dynamic.viewport.count < total_count)
+      cmd_buffer->state.dynamic.viewport.count = total_count;
+
+   memcpy(cmd_buffer->state.dynamic.viewport.viewports + firstViewport,
+          pViewports, viewportCount * sizeof(*pViewports));
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
+}
+
+/* Record dynamic scissors [firstScissor, firstScissor + scissorCount) and
+ * mark the scissor state dirty.
+ */
+void anv_CmdSetScissor(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstScissor,
+    uint32_t                                    scissorCount,
+    const VkRect2D*                             pScissors)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* Grow (never shrink) the tracked scissor count.  The previous code had
+    * a stray semicolon after the `if`, making the assignment unconditional
+    * and allowing the count to shrink.
+    */
+   const uint32_t total_count = firstScissor + scissorCount;
+   if (cmd_buffer->state.dynamic.scissor.count < total_count)
+      cmd_buffer->state.dynamic.scissor.count = total_count;
+
+   memcpy(cmd_buffer->state.dynamic.scissor.scissors + firstScissor,
+          pScissors, scissorCount * sizeof(*pScissors));
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
+}
+
+void anv_CmdSetLineWidth(
+    VkCommandBuffer                             commandBuffer,
+    float                                       lineWidth)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_cmd_state *state = &cmd_buffer->state;
+
+   /* Record the new dynamic line width and flag it for re-emission. */
+   state->dynamic.line_width = lineWidth;
+   state->dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
+}
+
+void anv_CmdSetDepthBias(
+    VkCommandBuffer                             commandBuffer,
+    float                                       depthBiasConstantFactor,
+    float                                       depthBiasClamp,
+    float                                       depthBiasSlopeFactor)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Record the three depth-bias factors and flag them for re-emission. */
+   dyn->depth_bias.bias = depthBiasConstantFactor;
+   dyn->depth_bias.clamp = depthBiasClamp;
+   dyn->depth_bias.slope = depthBiasSlopeFactor;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
+}
+
+void anv_CmdSetBlendConstants(
+    VkCommandBuffer                             commandBuffer,
+    const float                                 blendConstants[4])
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Record the four RGBA blend constants and flag them for re-emission. */
+   typed_memcpy(dyn->blend_constants, blendConstants, 4);
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
+}
+
+void anv_CmdSetDepthBounds(
+    VkCommandBuffer                             commandBuffer,
+    float                                       minDepthBounds,
+    float                                       maxDepthBounds)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Record the depth-bounds range and flag it for re-emission. */
+   dyn->depth_bounds.min = minDepthBounds;
+   dyn->depth_bounds.max = maxDepthBounds;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
+}
+
+void anv_CmdSetStencilCompareMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    compareMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Apply the compare mask to whichever faces faceMask selects. */
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      dyn->stencil_compare_mask.front = compareMask;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      dyn->stencil_compare_mask.back = compareMask;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
+}
+
+/* vkCmdSetStencilWriteMask: update the dynamic stencil write mask for the
+ * selected faces and flag the state dirty.
+ */
+void anv_CmdSetStencilWriteMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    writeMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd, commandBuffer);
+
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      cmd->state.dynamic.stencil_write_mask.front = writeMask;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      cmd->state.dynamic.stencil_write_mask.back = writeMask;
+
+   cmd->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
+}
+
+/* vkCmdSetStencilReference: update the dynamic stencil reference value for
+ * the selected faces and flag the state dirty.
+ */
+void anv_CmdSetStencilReference(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    reference)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd, commandBuffer);
+
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      cmd->state.dynamic.stencil_reference.front = reference;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      cmd->state.dynamic.stencil_reference.back = reference;
+
+   cmd->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
+}
+
+/* vkCmdBindDescriptorSets: bind descriptor sets [firstSet,
+ * firstSet + descriptorSetCount) and store the dynamic buffer offsets into
+ * each affected stage's push constant data.
+ */
+void anv_CmdBindDescriptorSets(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineBindPoint                         pipelineBindPoint,
+    VkPipelineLayout                            _layout,
+    uint32_t                                    firstSet,
+    uint32_t                                    descriptorSetCount,
+    const VkDescriptorSet*                      pDescriptorSets,
+    uint32_t                                    dynamicOffsetCount,
+    const uint32_t*                             pDynamicOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
+   struct anv_descriptor_set_layout *set_layout;
+
+   /* Binding up to and including the last slot is legal, so the bound must
+    * be inclusive (firstSet + descriptorSetCount may equal MAX_SETS).
+    */
+   assert(firstSet + descriptorSetCount <= MAX_SETS);
+
+   /* Index of the next unconsumed entry in pDynamicOffsets; the spec says
+    * dynamic offsets are consumed consecutively across the bound sets.
+    */
+   uint32_t dynamic_slot = 0;
+   for (uint32_t i = 0; i < descriptorSetCount; i++) {
+      ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
+      set_layout = layout->set[firstSet + i].layout;
+
+      /* Only dirty the affected stages when the set actually changes. */
+      if (cmd_buffer->state.descriptors[firstSet + i] != set) {
+         cmd_buffer->state.descriptors[firstSet + i] = set;
+         cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages;
+      }
+
+      if (set_layout->dynamic_offset_count > 0) {
+         anv_foreach_stage(s, set_layout->shader_stages) {
+            anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, s, dynamic);
+
+            struct anv_push_constants *push =
+               cmd_buffer->state.push_constants[s];
+
+            /* Copy an (offset, range) pair for every dynamic buffer element
+             * in this set into the per-stage push constant data.
+             */
+            unsigned d = layout->set[firstSet + i].dynamic_offset_start;
+            const uint32_t *offsets = pDynamicOffsets + dynamic_slot;
+            struct anv_descriptor *desc = set->descriptors;
+
+            for (unsigned b = 0; b < set_layout->binding_count; b++) {
+               if (set_layout->binding[b].dynamic_offset_index < 0)
+                  continue;
+
+               unsigned array_size = set_layout->binding[b].array_size;
+               for (unsigned j = 0; j < array_size; j++) {
+                  uint32_t range = 0;
+                  if (desc->buffer_view)
+                     range = desc->buffer_view->range;
+                  push->dynamic[d].offset = *(offsets++);
+                  push->dynamic[d].range = range;
+                  desc++;
+                  d++;
+               }
+            }
+         }
+         cmd_buffer->state.push_constants_dirty |= set_layout->shader_stages;
+
+         /* Advance past the offsets consumed by this set.  Without this,
+          * every set with dynamic buffers would read from the start of
+          * pDynamicOffsets instead of its own slice.
+          */
+         dynamic_slot += set_layout->dynamic_offset_count;
+      }
+   }
+}
+
+/* vkCmdBindVertexBuffers: stash (buffer, offset) pairs for the given
+ * binding range and mark those bindings dirty.
+ */
+void anv_CmdBindVertexBuffers(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstBinding,
+    uint32_t                                    bindingCount,
+    const VkBuffer*                             pBuffers,
+    const VkDeviceSize*                         pOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
+
+   /* We have to defer setting up vertex buffer since we need the buffer
+    * stride from the pipeline. */
+
+   /* Binding through the last slot (firstBinding + bindingCount ==
+    * MAX_VBS) is valid, so the range check must be inclusive.
+    */
+   assert(firstBinding + bindingCount <= MAX_VBS);
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      vb[firstBinding + i].buffer = anv_buffer_from_handle(pBuffers[i]);
+      vb[firstBinding + i].offset = pOffsets[i];
+      /* Unsigned literal: left-shifting 1 (signed) into bit 31 is UB. */
+      cmd_buffer->state.vb_dirty |= 1u << (firstBinding + i);
+   }
+}
+
+/* Record a relocation for the buffer address embedded in a SURFACE_STATE
+ * entry so it can be patched once the BO's final address is known.
+ */
+static void
+add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
+                        struct anv_state state, struct anv_bo *bo, uint32_t offset)
+{
+   /* The address goes in SURFACE_STATE dword 1 for gens < 8 and dwords 8 and
+    * 9 for gen8+.  We only write the first dword for gen8+ here and rely on
+    * the initial state to set the high bits to 0. */
+
+   const uint32_t dword = cmd_buffer->device->info.gen < 8 ? 1 : 8;
+
+   /* Reloc target is the byte offset of that dword within the state. */
+   anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
+                      state.offset + dword * 4, bo, offset);
+}
+
+/* Map a buffer descriptor type to the format its surface state uses:
+ * RGBA32-float for uniform buffers, undefined for storage buffers.
+ */
+const struct anv_format *
+anv_format_for_descriptor_type(VkDescriptorType type)
+{
+   VkFormat vk_format;
+
+   switch (type) {
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      vk_format = VK_FORMAT_R32G32B32A32_SFLOAT;
+      break;
+
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+      vk_format = VK_FORMAT_UNDEFINED;
+      break;
+
+   default:
+      unreachable("Invalid descriptor type");
+   }
+
+   return anv_format_for_vk_format(vk_format);
+}
+
+/* Allocate and fill the binding table for one shader stage: render-target
+ * surfaces first (fragment), an optional num_workgroups surface (compute),
+ * then one entry per descriptor surface.  The table is returned through
+ * bt_state.
+ */
+VkResult
+anv_cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
+                                  gl_shader_stage stage,
+                                  struct anv_state *bt_state)
+{
+   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   struct anv_subpass *subpass = cmd_buffer->state.subpass;
+   struct anv_pipeline_layout *layout;
+   uint32_t color_count, bias, state_offset;
+
+   /* "bias" is the number of table slots reserved ahead of the descriptor
+    * surfaces: MAX_RTS for the fragment stage, one (num_workgroups) for
+    * compute, none otherwise.
+    */
+   switch (stage) {
+   case  MESA_SHADER_FRAGMENT:
+      layout = cmd_buffer->state.pipeline->layout;
+      bias = MAX_RTS;
+      color_count = subpass->color_count;
+      break;
+   case  MESA_SHADER_COMPUTE:
+      layout = cmd_buffer->state.compute_pipeline->layout;
+      bias = 1;
+      color_count = 0;
+      break;
+   default:
+      layout = cmd_buffer->state.pipeline->layout;
+      bias = 0;
+      color_count = 0;
+      break;
+   }
+
+   /* This is a little awkward: layout can be NULL but we still have to
+    * allocate and set a binding table for the PS stage for render
+    * targets. */
+   uint32_t surface_count = layout ? layout->stage[stage].surface_count : 0;
+
+   /* NOTE(review): a compute shader with no descriptor surfaces but with
+    * uses_num_work_groups set would take this early return and never get a
+    * table entry for the num_workgroups surface below — confirm whether
+    * that combination can occur.
+    */
+   if (color_count + surface_count == 0) {
+      *bt_state = (struct anv_state) { 0, };
+      return VK_SUCCESS;
+   }
+
+   *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
+                                                  bias + surface_count,
+                                                  &state_offset);
+   uint32_t *bt_map = bt_state->map;
+
+   if (bt_state->map == NULL)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   /* Render targets: one pre-baked RT surface state per color attachment,
+    * plus a relocation for each attachment's BO.
+    */
+   for (uint32_t a = 0; a < color_count; a++) {
+      const struct anv_image_view *iview =
+         fb->attachments[subpass->color_attachments[a]];
+
+      assert(iview->color_rt_surface_state.alloc_size);
+      bt_map[a] = iview->color_rt_surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, iview->color_rt_surface_state,
+                              iview->bo, iview->offset);
+   }
+
+   /* Compute only: expose the 12-byte num_workgroups buffer (three 32-bit
+    * counts) through slot 0, the slot reserved by bias == 1.
+    */
+   if (stage == MESA_SHADER_COMPUTE &&
+       cmd_buffer->state.compute_pipeline->cs_prog_data.uses_num_work_groups) {
+      struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
+      uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
+
+      struct anv_state surface_state;
+      surface_state =
+         anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+
+      const struct anv_format *format =
+         anv_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
+      anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
+                                    format->surface_format, bo_offset, 12, 1);
+
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(surface_state);
+
+      bt_map[0] = surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
+   }
+
+   if (layout == NULL)
+      goto out;
+
+   /* Storage images/texel buffers also need brw_image_param push data;
+    * make sure the images field exists before filling it below.
+    */
+   if (layout->stage[stage].image_count > 0) {
+      VkResult result =
+         anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
+      if (result != VK_SUCCESS)
+         return result;
+
+      cmd_buffer->state.push_constants_dirty |= 1 << stage;
+   }
+
+   /* One table entry per descriptor surface, in the order given by the
+    * pipeline layout's surface_to_descriptor map.
+    */
+   uint32_t image = 0;
+   for (uint32_t s = 0; s < layout->stage[stage].surface_count; s++) {
+      struct anv_pipeline_binding *binding =
+         &layout->stage[stage].surface_to_descriptor[s];
+      struct anv_descriptor_set *set =
+         cmd_buffer->state.descriptors[binding->set];
+      struct anv_descriptor *desc = &set->descriptors[binding->offset];
+
+      struct anv_state surface_state;
+      struct anv_bo *bo;
+      uint32_t bo_offset;
+
+      switch (desc->type) {
+      case VK_DESCRIPTOR_TYPE_SAMPLER:
+         /* Nothing for us to do here */
+         continue;
+
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         surface_state = desc->image_view->nonrt_surface_state;
+         assert(surface_state.alloc_size);
+         bo = desc->image_view->bo;
+         bo_offset = desc->image_view->offset;
+         break;
+
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
+         surface_state = desc->image_view->storage_surface_state;
+         assert(surface_state.alloc_size);
+         bo = desc->image_view->bo;
+         bo_offset = desc->image_view->offset;
+
+         /* Storage images additionally publish image params (filled from
+          * the image view) in the push constants.
+          */
+         struct brw_image_param *image_param =
+            &cmd_buffer->state.push_constants[stage]->images[image++];
+
+         anv_image_view_fill_image_param(cmd_buffer->device, desc->image_view,
+                                         image_param);
+         image_param->surface_idx = bias + s;
+         break;
+      }
+
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+         surface_state = desc->buffer_view->surface_state;
+         assert(surface_state.alloc_size);
+         bo = desc->buffer_view->bo;
+         bo_offset = desc->buffer_view->offset;
+         break;
+
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+         surface_state = desc->buffer_view->storage_surface_state;
+         assert(surface_state.alloc_size);
+         bo = desc->buffer_view->bo;
+         bo_offset = desc->buffer_view->offset;
+
+         /* Like storage images, storage texel buffers get image params. */
+         struct brw_image_param *image_param =
+            &cmd_buffer->state.push_constants[stage]->images[image++];
+
+         anv_buffer_view_fill_image_param(cmd_buffer->device, desc->buffer_view,
+                                          image_param);
+         image_param->surface_idx = bias + s;
+         break;
+
+      default:
+         assert(!"Invalid descriptor type");
+         continue;
+      }
+
+      bt_map[bias + s] = surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
+   }
+   assert(image == layout->stage[stage].image_count);
+
+ out:
+   /* Platforms without LLC need an explicit flush before the GPU reads. */
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(*bt_state);
+
+   return VK_SUCCESS;
+}
+
+/* Allocate and fill the sampler state table for one shader stage.  Each
+ * entry is 16 bytes copied from the corresponding anv_sampler.  Returns an
+ * empty state when the stage uses no samplers.
+ */
+VkResult
+anv_cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
+                             gl_shader_stage stage, struct anv_state *state)
+{
+   struct anv_pipeline_layout *layout;
+   uint32_t sampler_count;
+
+   if (stage == MESA_SHADER_COMPUTE)
+      layout = cmd_buffer->state.compute_pipeline->layout;
+   else
+      layout = cmd_buffer->state.pipeline->layout;
+
+   sampler_count = layout ? layout->stage[stage].sampler_count : 0;
+   if (sampler_count == 0) {
+      *state = (struct anv_state) { 0, };
+      return VK_SUCCESS;
+   }
+
+   /* 16 bytes per sampler entry; the table itself is 32-byte aligned. */
+   uint32_t size = sampler_count * 16;
+   *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
+
+   if (state->map == NULL)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   for (uint32_t s = 0; s < layout->stage[stage].sampler_count; s++) {
+      struct anv_pipeline_binding *binding =
+         &layout->stage[stage].sampler_to_descriptor[s];
+      struct anv_descriptor_set *set =
+         cmd_buffer->state.descriptors[binding->set];
+      struct anv_descriptor *desc = &set->descriptors[binding->offset];
+
+      /* Skip descriptors that don't carry a sampler. */
+      if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
+          desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         continue;
+
+      struct anv_sampler *sampler = desc->sampler;
+
+      /* This can happen if we have an unfilled slot since TYPE_SAMPLER
+       * happens to be zero.
+       */
+      if (sampler == NULL)
+         continue;
+
+      memcpy(state->map + (s * 16),
+             sampler->state, sizeof(sampler->state));
+   }
+
+   /* Platforms without LLC need an explicit flush before the GPU reads. */
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(*state);
+
+   return VK_SUCCESS;
+}
+
+/* Copy caller-provided bytes into freshly allocated dynamic state and
+ * return that state, flushing the CPU cache on platforms without LLC.
+ */
+struct anv_state
+anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer,
+                            const void *data, uint32_t size, uint32_t alignment)
+{
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment);
+
+   memcpy(state.map, data, size);
+
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
+
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(state.map, size));
+
+   return state;
+}
+
+/* OR two equal-length dword arrays together into freshly allocated dynamic
+ * state and return that state.
+ */
+struct anv_state
+anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer,
+                             uint32_t *a, uint32_t *b,
+                             uint32_t dwords, uint32_t alignment)
+{
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, dwords * 4, alignment);
+   uint32_t *out = state.map;
+
+   for (uint32_t i = 0; i < dwords; i++)
+      out[i] = a[i] | b[i];
+
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
+
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(out, dwords * 4));
+
+   return state;
+}
+
+/**
+ * @brief Setup the command buffer for recording commands inside the given
+ * subpass.
+ *
+ * This does not record all commands needed for starting the subpass.
+ * Starting the subpass may require additional commands.
+ *
+ * Note that vkCmdBeginRenderPass, vkCmdNextSubpass, and vkBeginCommandBuffer
+ * with VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT, all setup the
+ * command buffer for recording commands for some subpass.  But only the first
+ * two, vkCmdBeginRenderPass and vkCmdNextSubpass, can start a subpass.
+ */
+void
+anv_cmd_buffer_set_subpass(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_subpass *subpass)
+{
+   /* Dispatch to the generation-specific implementation. */
+   const int gen = cmd_buffer->device->info.gen;
+
+   if (gen == 7)
+      gen7_cmd_buffer_set_subpass(cmd_buffer, subpass);
+   else if (gen == 8)
+      gen8_cmd_buffer_set_subpass(cmd_buffer, subpass);
+   else if (gen == 9)
+      gen9_cmd_buffer_set_subpass(cmd_buffer, subpass);
+   else
+      unreachable("unsupported gen\n");
+}
+
+/* Gather the push constant values for a 3D stage into dynamic state.  Each
+ * entry of prog_data->param is interpreted as a byte offset into the
+ * stage's anv_push_constants block; one dword is copied per param.
+ */
+struct anv_state
+anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
+                              gl_shader_stage stage)
+{
+   struct anv_push_constants *data =
+      cmd_buffer->state.push_constants[stage];
+   struct brw_stage_prog_data *prog_data =
+      cmd_buffer->state.pipeline->prog_data[stage];
+
+   /* If we don't actually have any push constants, bail. */
+   if (data == NULL || prog_data->nr_params == 0)
+      return (struct anv_state) { .offset = 0 };
+
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                         prog_data->nr_params * sizeof(float),
+                                         32 /* bottom 5 bits MBZ */);
+
+   /* Walk through the param array and fill the buffer with data */
+   uint32_t *u32_map = state.map;
+   for (unsigned i = 0; i < prog_data->nr_params; i++) {
+      uint32_t offset = (uintptr_t)prog_data->param[i];
+      u32_map[i] = *(uint32_t *)((uint8_t *)data + offset);
+   }
+
+   /* Platforms without LLC need an explicit flush before the GPU reads. */
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
+
+   return state;
+}
+
+/* Build the compute push constant buffer: each thread gets its local
+ * invocation IDs followed by the uniform params, at a 32-byte-aligned
+ * per-thread stride.  Uniforms are written once and then replicated to the
+ * remaining threads.
+ */
+struct anv_state
+anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_push_constants *data =
+      cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE];
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+
+   /* Each local-invocation-ID register holds 8 dwords. */
+   const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
+   const unsigned push_constant_data_size =
+      (local_id_dwords + prog_data->nr_params) * 4;
+   /* Per-thread data is padded out to a 32-byte boundary. */
+   const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
+   const unsigned param_aligned_count =
+      reg_aligned_constant_size / sizeof(uint32_t);
+
+   /* If we don't actually have any push constants, bail. */
+   if (reg_aligned_constant_size == 0)
+      return (struct anv_state) { .offset = 0 };
+
+   const unsigned threads = pipeline->cs_thread_width_max;
+   const unsigned total_push_constants_size =
+      reg_aligned_constant_size * threads;
+   const unsigned push_constant_alignment =
+      cmd_buffer->device->info.gen < 8 ? 32 : 64;
+   const unsigned aligned_total_push_constants_size =
+      ALIGN(total_push_constants_size, push_constant_alignment);
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                         aligned_total_push_constants_size,
+                                         push_constant_alignment);
+
+   /* Walk through the param array and fill the buffer with data */
+   uint32_t *u32_map = state.map;
+
+   /* Write every thread's local invocation IDs first. */
+   brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads,
+                                reg_aligned_constant_size);
+
+   /* Setup uniform data for the first thread */
+   for (unsigned i = 0; i < prog_data->nr_params; i++) {
+      uint32_t offset = (uintptr_t)prog_data->param[i];
+      u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset);
+   }
+
+   /* Copy uniform data from the first thread to every other thread */
+   const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t);
+   for (unsigned t = 1; t < threads; t++) {
+      memcpy(&u32_map[t * param_aligned_count + local_id_dwords],
+             &u32_map[local_id_dwords],
+             uniform_data_size);
+   }
+
+   /* Platforms without LLC need an explicit flush before the GPU reads. */
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
+
+   return state;
+}
+
+/* vkCmdPushConstants: copy the client's bytes into the client_data region
+ * of every stage selected by stageFlags and mark those stages dirty.
+ */
+void anv_CmdPushConstants(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineLayout                            layout,
+    VkShaderStageFlags                          stageFlags,
+    uint32_t                                    offset,
+    uint32_t                                    size,
+    const void*                                 pValues)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_foreach_stage(stage, stageFlags) {
+      /* Make sure the stage's push constant block exists before writing. */
+      anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, client_data);
+
+      memcpy(cmd_buffer->state.push_constants[stage]->client_data + offset,
+             pValues, size);
+   }
+
+   cmd_buffer->state.push_constants_dirty |= stageFlags;
+}
+
+/* vkCmdExecuteCommands: record the given secondary command buffers into
+ * the primary, one at a time, via anv_cmd_buffer_add_secondary.
+ */
+void anv_CmdExecuteCommands(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCmdBuffers)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
+
+   assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   /* NOTE(review): this asserts we are still in the first subpass —
+    * presumably execution inside later subpasses is not yet supported here.
+    */
+   anv_assert(primary->state.subpass == &primary->state.pass->subpasses[0]);
+
+   for (uint32_t i = 0; i < commandBufferCount; i++) {
+      ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
+
+      assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+
+      anv_cmd_buffer_add_secondary(primary, secondary);
+   }
+}
+
+/* vkCreateCommandPool: allocate a command pool object and initialize its
+ * command-buffer list.  The pool remembers which allocator to hand down to
+ * the command buffers carved from it.
+ */
+VkResult anv_CreateCommandPool(
+    VkDevice                                    _device,
+    const VkCommandPoolCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkCommandPool*                              pCmdPool)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   struct anv_cmd_pool *pool =
+      anv_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
+                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pool == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Prefer the caller's allocator; fall back to the device's. */
+   pool->alloc = pAllocator ? *pAllocator : device->alloc;
+
+   list_inithead(&pool->cmd_buffers);
+
+   *pCmdPool = anv_cmd_pool_to_handle(pool);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroyCommandPool: tear down every command buffer still owned by the
+ * pool (via the reset call, which currently destroys them) and then free
+ * the pool itself.
+ */
+void anv_DestroyCommandPool(
+    VkDevice                                    _device,
+    VkCommandPool                               commandPool,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
+
+   anv_ResetCommandPool(_device, commandPool, 0);
+
+   anv_free2(&device->alloc, pAllocator, pool);
+}
+
+/* vkResetCommandPool: currently implemented by destroying every command
+ * buffer in the pool (see the FIXME below about spec conformance).
+ */
+VkResult anv_ResetCommandPool(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    VkCommandPoolResetFlags                     flags)
+{
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
+
+   /* FIXME: vkResetCommandPool must not destroy its command buffers. The
+    * Vulkan 1.0 spec requires that it only reset them:
+    *
+    *    Resetting a command pool recycles all of the resources from all of
+    *    the command buffers allocated from the command pool back to the
+    *    command pool. All command buffers that have been allocated from the
+    *    command pool are put in the initial state.
+    */
+   list_for_each_entry_safe(struct anv_cmd_buffer, cmd_buffer,
+                            &pool->cmd_buffers, pool_link) {
+      anv_cmd_buffer_destroy(cmd_buffer);
+   }
+
+   return VK_SUCCESS;
+}
+
+/**
+ * Return NULL if the current subpass has no depthstencil attachment.
+ */
+const struct anv_image_view *
+anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer)
+{
+   const struct anv_subpass *subpass = cmd_buffer->state.subpass;
+
+   if (subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
+      return NULL;
+
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   const struct anv_image_view *iview =
+      fb->attachments[subpass->depth_stencil_attachment];
+
+   /* The attachment must carry a depth and/or stencil aspect. */
+   assert(iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT |
+                                VK_IMAGE_ASPECT_STENCIL_BIT));
+
+   return iview;
+}
diff --git a/src/vulkan/anv_descriptor_set.c b/src/vulkan/anv_descriptor_set.c
new file mode 100644 (file)
index 0000000..8b3b5df
--- /dev/null
@@ -0,0 +1,596 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+/*
+ * Descriptor set layouts.
+ */
+
+/* vkCreateDescriptorSetLayout: build an anv_descriptor_set_layout in one
+ * contiguous allocation holding the layout struct, one binding_layout per
+ * binding number (including holes), and the immutable sampler pointers.
+ */
+VkResult anv_CreateDescriptorSetLayout(
+    VkDevice                                    _device,
+    const VkDescriptorSetLayoutCreateInfo*      pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDescriptorSetLayout*                      pSetLayout)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_descriptor_set_layout *set_layout;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO);
+
+   /* First pass: find the highest binding number and count immutable
+    * samplers so the single allocation can be sized.
+    */
+   uint32_t max_binding = 0;
+   uint32_t immutable_sampler_count = 0;
+   for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
+      max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
+      if (pCreateInfo->pBindings[j].pImmutableSamplers)
+         immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
+   }
+
+   size_t size = sizeof(struct anv_descriptor_set_layout) +
+                 (max_binding + 1) * sizeof(set_layout->binding[0]) +
+                 immutable_sampler_count * sizeof(struct anv_sampler *);
+
+   set_layout = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!set_layout)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* We just allocate all the samplers at the end of the struct */
+   struct anv_sampler **samplers =
+      (struct anv_sampler **)&set_layout->binding[max_binding + 1];
+
+   set_layout->binding_count = max_binding + 1;
+   set_layout->shader_stages = 0;
+   set_layout->size = 0;
+
+   for (uint32_t b = 0; b <= max_binding; b++) {
+      /* Initialize all binding_layout entries to -1 */
+      memset(&set_layout->binding[b], -1, sizeof(set_layout->binding[b]));
+
+      set_layout->binding[b].immutable_samplers = NULL;
+   }
+
+   /* Initialize all samplers to 0 */
+   memset(samplers, 0, immutable_sampler_count * sizeof(*samplers));
+
+   uint32_t sampler_count[MESA_SHADER_STAGES] = { 0, };
+   uint32_t surface_count[MESA_SHADER_STAGES] = { 0, };
+   uint32_t image_count[MESA_SHADER_STAGES] = { 0, };
+   uint32_t buffer_count = 0;
+   uint32_t dynamic_offset_count = 0;
+
+   /* Second pass: fill in each referenced binding.  The four switches
+    * below classify the descriptor type along independent axes: sampler
+    * slots, buffer/dynamic-offset slots, surface slots, and image slots.
+    */
+   for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
+      const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
+      uint32_t b = binding->binding;
+
+      assert(binding->descriptorCount > 0);
+      set_layout->binding[b].array_size = binding->descriptorCount;
+      set_layout->binding[b].descriptor_index = set_layout->size;
+      set_layout->size += binding->descriptorCount;
+
+      /* Sampler slots, per stage. */
+      switch (binding->descriptorType) {
+      case VK_DESCRIPTOR_TYPE_SAMPLER:
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+         anv_foreach_stage(s, binding->stageFlags) {
+            set_layout->binding[b].stage[s].sampler_index = sampler_count[s];
+            sampler_count[s] += binding->descriptorCount;
+         }
+         break;
+      default:
+         break;
+      }
+
+      /* Buffer slots (buffers also get a surface, via the fallthrough). */
+      switch (binding->descriptorType) {
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+         set_layout->binding[b].buffer_index = buffer_count;
+         buffer_count += binding->descriptorCount;
+         /* fall through */
+
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         anv_foreach_stage(s, binding->stageFlags) {
+            set_layout->binding[b].stage[s].surface_index = surface_count[s];
+            surface_count[s] += binding->descriptorCount;
+         }
+         break;
+      default:
+         break;
+      }
+
+      /* Dynamic-offset slots (shared across stages). */
+      switch (binding->descriptorType) {
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+         set_layout->binding[b].dynamic_offset_index = dynamic_offset_count;
+         dynamic_offset_count += binding->descriptorCount;
+         break;
+      default:
+         break;
+      }
+
+      /* Image-param slots, per stage (storage images / texel buffers). */
+      switch (binding->descriptorType) {
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+         anv_foreach_stage(s, binding->stageFlags) {
+            set_layout->binding[b].stage[s].image_index = image_count[s];
+            image_count[s] += binding->descriptorCount;
+         }
+         break;
+      default:
+         break;
+      }
+
+      if (binding->pImmutableSamplers) {
+         set_layout->binding[b].immutable_samplers = samplers;
+         samplers += binding->descriptorCount;
+
+         for (uint32_t i = 0; i < binding->descriptorCount; i++)
+            set_layout->binding[b].immutable_samplers[i] =
+               anv_sampler_from_handle(binding->pImmutableSamplers[i]);
+      } else {
+         set_layout->binding[b].immutable_samplers = NULL;
+      }
+
+      set_layout->shader_stages |= binding->stageFlags;
+   }
+
+   set_layout->buffer_count = buffer_count;
+   set_layout->dynamic_offset_count = dynamic_offset_count;
+
+   *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroyDescriptorSetLayout: the layout, its binding table and the
+ * immutable sampler pointers are one allocation, so a single free suffices.
+ */
+void anv_DestroyDescriptorSetLayout(
+    VkDevice                                    _device,
+    VkDescriptorSetLayout                       _set_layout,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, _set_layout);
+
+   anv_free2(&device->alloc, pAllocator, set_layout);
+}
+
+/*
+ * Pipeline layouts.  These have nothing to do with the pipeline.  They are
+ * just multiple descriptor set layouts pasted together.
+ */
+
+/* vkCreatePipelineLayout: flatten the given descriptor set layouts into
+ * per-shader-stage binding-table maps.
+ *
+ * The layout is first built in a stack-local copy (l) in two passes:
+ * pass 1 counts surfaces/samplers/images per stage so the size of the
+ * trailing entries[] array is known; pass 2 (after allocation) fills the
+ * surface->descriptor and sampler->descriptor maps.  Finally the stack
+ * copy is memcpy'd (struct-assigned) into the allocation.
+ */
+VkResult anv_CreatePipelineLayout(
+    VkDevice                                    _device,
+    const VkPipelineLayoutCreateInfo*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipelineLayout*                           pPipelineLayout)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_pipeline_layout l, *layout;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO);
+
+   l.num_sets = pCreateInfo->setLayoutCount;
+
+   unsigned dynamic_offset_count = 0;
+
+   memset(l.stage, 0, sizeof(l.stage));
+   for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
+      ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout,
+                      pCreateInfo->pSetLayouts[set]);
+      l.set[set].layout = set_layout;
+
+      /* Dynamic offsets are numbered consecutively across all sets. */
+      l.set[set].dynamic_offset_start = dynamic_offset_count;
+      for (uint32_t b = 0; b < set_layout->binding_count; b++) {
+         if (set_layout->binding[b].dynamic_offset_index >= 0)
+            dynamic_offset_count += set_layout->binding[b].array_size;
+      }
+
+      /* Pass 1: tally the surface/sampler/image slots each set contributes
+       * to each stage.  A negative *_index in the set layout means the
+       * binding does not use that kind of slot in that stage.
+       */
+      for (gl_shader_stage s = 0; s < MESA_SHADER_STAGES; s++) {
+         l.set[set].stage[s].surface_start = l.stage[s].surface_count;
+         l.set[set].stage[s].sampler_start = l.stage[s].sampler_count;
+         l.set[set].stage[s].image_start = l.stage[s].image_count;
+
+         for (uint32_t b = 0; b < set_layout->binding_count; b++) {
+            unsigned array_size = set_layout->binding[b].array_size;
+
+            if (set_layout->binding[b].stage[s].surface_index >= 0) {
+               l.stage[s].surface_count += array_size;
+
+               if (set_layout->binding[b].dynamic_offset_index >= 0)
+                  l.stage[s].has_dynamic_offsets = true;
+            }
+
+            if (set_layout->binding[b].stage[s].sampler_index >= 0)
+               l.stage[s].sampler_count += array_size;
+
+            if (set_layout->binding[b].stage[s].image_index >= 0)
+               l.stage[s].image_count += array_size;
+         }
+      }
+   }
+
+   /* Total binding-table entries across all stages; sizes entries[]. */
+   unsigned num_bindings = 0;
+   for (gl_shader_stage s = 0; s < MESA_SHADER_STAGES; s++) {
+      num_bindings += l.stage[s].surface_count +
+                      l.stage[s].sampler_count +
+                      l.stage[s].image_count;
+   }
+
+   size_t size = sizeof(*layout) + num_bindings * sizeof(layout->entries[0]);
+
+   layout = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (layout == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Now we can actually build our surface and sampler maps */
+   struct anv_pipeline_binding *entry = layout->entries;
+   for (gl_shader_stage s = 0; s < MESA_SHADER_STAGES; s++) {
+      l.stage[s].surface_to_descriptor = entry;
+      entry += l.stage[s].surface_count;
+      l.stage[s].sampler_to_descriptor = entry;
+      entry += l.stage[s].sampler_count;
+      /* Image slots get space reserved but no map pointer is stored. */
+      entry += l.stage[s].image_count;
+
+      int surface = 0;
+      int sampler = 0;
+      for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
+         struct anv_descriptor_set_layout *set_layout = l.set[set].layout;
+
+         for (uint32_t b = 0; b < set_layout->binding_count; b++) {
+            unsigned array_size = set_layout->binding[b].array_size;
+            unsigned set_offset = set_layout->binding[b].descriptor_index;
+
+            if (set_layout->binding[b].stage[s].surface_index >= 0) {
+               /* Pass 2 must visit slots in the same order pass 1 counted
+                * them; the assert checks the two passes agree.
+                */
+               assert(surface == l.set[set].stage[s].surface_start +
+                                 set_layout->binding[b].stage[s].surface_index);
+               for (unsigned i = 0; i < array_size; i++) {
+                  l.stage[s].surface_to_descriptor[surface + i].set = set;
+                  l.stage[s].surface_to_descriptor[surface + i].offset = set_offset + i;
+               }
+               surface += array_size;
+            }
+
+            if (set_layout->binding[b].stage[s].sampler_index >= 0) {
+               assert(sampler == l.set[set].stage[s].sampler_start +
+                                 set_layout->binding[b].stage[s].sampler_index);
+               for (unsigned i = 0; i < array_size; i++) {
+                  l.stage[s].sampler_to_descriptor[sampler + i].set = set;
+                  l.stage[s].sampler_to_descriptor[sampler + i].offset = set_offset + i;
+               }
+               sampler += array_size;
+            }
+         }
+      }
+   }
+
+   /* Finally, we're done setting it up, copy into the allocated version */
+   *layout = l;
+
+   *pPipelineLayout = anv_pipeline_layout_to_handle(layout);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroyPipelineLayout: the layout and its trailing entries[] array are
+ * one allocation, so a single anv_free2 suffices.
+ */
+void anv_DestroyPipelineLayout(
+    VkDevice                                    _device,
+    VkPipelineLayout                            _pipelineLayout,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, _pipelineLayout);
+
+   anv_free2(&device->alloc, pAllocator, pipeline_layout);
+}
+
+/*
+ * Descriptor pools.  These are a no-op for now.
+ */
+
+/* vkCreateDescriptorPool: stub.  Descriptor sets are allocated straight
+ * from the device allocator, so the pool is just a dummy non-NULL handle
+ * to keep loaders/validation happy.
+ */
+VkResult anv_CreateDescriptorPool(
+    VkDevice                                    device,
+    const VkDescriptorPoolCreateInfo*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDescriptorPool*                           pDescriptorPool)
+{
+   anv_finishme("VkDescriptorPool is a stub");
+   *pDescriptorPool = (VkDescriptorPool)1;
+   return VK_SUCCESS;
+}
+
+/* vkDestroyDescriptorPool: stub.  NOTE(review): per the Vulkan spec this
+ * should free every set still allocated from the pool; sets currently leak
+ * unless the app calls vkFreeDescriptorSets itself.
+ */
+void anv_DestroyDescriptorPool(
+    VkDevice                                    _device,
+    VkDescriptorPool                            _pool,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   anv_finishme("VkDescriptorPool is a stub: free the pool's descriptor sets");
+}
+
+/* vkResetDescriptorPool: stub; should return all of the pool's sets but
+ * currently does nothing (see anv_DestroyDescriptorPool).
+ */
+VkResult anv_ResetDescriptorPool(
+    VkDevice                                    device,
+    VkDescriptorPool                            descriptorPool,
+    VkDescriptorPoolResetFlags                  flags)
+{
+   anv_finishme("VkDescriptorPool is a stub: free the pool's descriptor sets");
+   return VK_SUCCESS;
+}
+
+/* Allocate and initialize a descriptor set for the given layout.
+ *
+ * The set struct and its trailing descriptors[] array are one allocation;
+ * buffer views are a second allocation, and each buffer view gets a
+ * surface state from the device's surface state pool.  Pools are not yet
+ * implemented, hence the direct device-allocator usage (XXX markers).
+ * On success *out_set owns all three; anv_descriptor_set_destroy is the
+ * matching teardown.
+ */
+VkResult
+anv_descriptor_set_create(struct anv_device *device,
+                          const struct anv_descriptor_set_layout *layout,
+                          struct anv_descriptor_set **out_set)
+{
+   struct anv_descriptor_set *set;
+   size_t size = sizeof(*set) + layout->size * sizeof(set->descriptors[0]);
+
+   set = anv_alloc(&device->alloc /* XXX: Use the pool */, size, 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!set)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* A descriptor set may not be 100% filled. Clear the set so we can
+    * later detect holes in it.
+    */
+   memset(set, 0, size);
+
+   set->layout = layout;
+
+   /* Go through and fill out immutable samplers if we have any */
+   struct anv_descriptor *desc = set->descriptors;
+   for (uint32_t b = 0; b < layout->binding_count; b++) {
+      if (layout->binding[b].immutable_samplers) {
+         for (uint32_t i = 0; i < layout->binding[b].array_size; i++)
+            desc[i].sampler = layout->binding[b].immutable_samplers[i];
+      }
+      desc += layout->binding[b].array_size;
+   }
+
+   /* XXX: Use the pool */
+   set->buffer_views =
+      anv_alloc(&device->alloc,
+                sizeof(set->buffer_views[0]) * layout->buffer_count, 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!set->buffer_views) {
+      anv_free(&device->alloc, set);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   /* Surface states are filled in later by anv_UpdateDescriptorSets. */
+   for (uint32_t b = 0; b < layout->buffer_count; b++) {
+      set->buffer_views[b].surface_state =
+         anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+   }
+
+   *out_set = set;
+
+   return VK_SUCCESS;
+}
+
+/* Tear down a set made by anv_descriptor_set_create: return each buffer
+ * view's surface state to the pool, then free the two allocations.
+ */
+void
+anv_descriptor_set_destroy(struct anv_device *device,
+                           struct anv_descriptor_set *set)
+{
+   /* XXX: Use the pool */
+   for (uint32_t b = 0; b < set->layout->buffer_count; b++)
+      anv_state_pool_free(&device->surface_state_pool,
+                          set->buffer_views[b].surface_state);
+
+   anv_free(&device->alloc, set->buffer_views);
+   anv_free(&device->alloc, set);
+}
+
+/* vkAllocateDescriptorSets: create one set per requested layout.  On
+ * failure, the sets created so far are freed before returning; the loop
+ * index i marks how many entries of pDescriptorSets are valid at that
+ * point.  The pool is ignored (pools are stubs).
+ */
+VkResult anv_AllocateDescriptorSets(
+    VkDevice                                    _device,
+    const VkDescriptorSetAllocateInfo*          pAllocateInfo,
+    VkDescriptorSet*                            pDescriptorSets)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   VkResult result = VK_SUCCESS;
+   struct anv_descriptor_set *set;
+   uint32_t i;
+
+   for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) {
+      ANV_FROM_HANDLE(anv_descriptor_set_layout, layout,
+                      pAllocateInfo->pSetLayouts[i]);
+
+      result = anv_descriptor_set_create(device, layout, &set);
+      if (result != VK_SUCCESS)
+         break;
+
+      pDescriptorSets[i] = anv_descriptor_set_to_handle(set);
+   }
+
+   /* Partial failure: release everything allocated before the error. */
+   if (result != VK_SUCCESS)
+      anv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool,
+                             i, pDescriptorSets);
+
+   return result;
+}
+
+/* vkFreeDescriptorSets: destroy each set individually; the pool argument
+ * is unused because pools are stubs.
+ */
+VkResult anv_FreeDescriptorSets(
+    VkDevice                                    _device,
+    VkDescriptorPool                            descriptorPool,
+    uint32_t                                    count,
+    const VkDescriptorSet*                      pDescriptorSets)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   for (uint32_t i = 0; i < count; i++) {
+      ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
+
+      anv_descriptor_set_destroy(device, set);
+   }
+
+   return VK_SUCCESS;
+}
+
+/* vkUpdateDescriptorSets: apply all descriptor writes, then all copies.
+ *
+ * Writes locate their slot through the binding layout's descriptor_index
+ * plus dstArrayElement, then store per-type payload (sampler, image view,
+ * buffer view).  Buffer descriptors also fill in their buffer view's
+ * surface state here.
+ *
+ * Fixes versus the original:
+ *  - the copy loop read the *source* set from copy->dstSet (typo); it now
+ *    correctly uses copy->srcSet;
+ *  - copies now index descriptors[] through the binding layout's
+ *    descriptor_index and honor src/dstArrayElement, matching the write
+ *    path, instead of using the raw binding number as an array index;
+ *  - the buffer-descriptor case was missing its trailing break (it fell
+ *    through to default:, which was benign but fragile).
+ */
+void anv_UpdateDescriptorSets(
+    VkDevice                                    _device,
+    uint32_t                                    descriptorWriteCount,
+    const VkWriteDescriptorSet*                 pDescriptorWrites,
+    uint32_t                                    descriptorCopyCount,
+    const VkCopyDescriptorSet*                  pDescriptorCopies)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   for (uint32_t i = 0; i < descriptorWriteCount; i++) {
+      const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
+      ANV_FROM_HANDLE(anv_descriptor_set, set, write->dstSet);
+      const struct anv_descriptor_set_binding_layout *bind_layout =
+         &set->layout->binding[write->dstBinding];
+      struct anv_descriptor *desc =
+         &set->descriptors[bind_layout->descriptor_index];
+      desc += write->dstArrayElement;
+
+      switch (write->descriptorType) {
+      case VK_DESCRIPTOR_TYPE_SAMPLER:
+         for (uint32_t j = 0; j < write->descriptorCount; j++) {
+            ANV_FROM_HANDLE(anv_sampler, sampler,
+                            write->pImageInfo[j].sampler);
+
+            desc[j] = (struct anv_descriptor) {
+               .type = VK_DESCRIPTOR_TYPE_SAMPLER,
+               .sampler = sampler,
+            };
+         }
+         break;
+
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+         for (uint32_t j = 0; j < write->descriptorCount; j++) {
+            ANV_FROM_HANDLE(anv_image_view, iview,
+                            write->pImageInfo[j].imageView);
+            ANV_FROM_HANDLE(anv_sampler, sampler,
+                            write->pImageInfo[j].sampler);
+
+            desc[j].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+            desc[j].image_view = iview;
+
+            /* If this descriptor has an immutable sampler, we don't want
+             * to stomp on it.
+             */
+            if (sampler)
+               desc[j].sampler = sampler;
+         }
+         break;
+
+      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+         for (uint32_t j = 0; j < write->descriptorCount; j++) {
+            ANV_FROM_HANDLE(anv_image_view, iview,
+                            write->pImageInfo[j].imageView);
+
+            desc[j] = (struct anv_descriptor) {
+               .type = write->descriptorType,
+               .image_view = iview,
+            };
+         }
+         break;
+
+      case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+         for (uint32_t j = 0; j < write->descriptorCount; j++) {
+            ANV_FROM_HANDLE(anv_buffer_view, bview,
+                            write->pTexelBufferView[j]);
+
+            desc[j] = (struct anv_descriptor) {
+               .type = write->descriptorType,
+               .buffer_view = bview,
+            };
+         }
+         break;
+
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         anv_finishme("input attachments not implemented");
+         break;
+
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+         for (uint32_t j = 0; j < write->descriptorCount; j++) {
+            assert(write->pBufferInfo[j].buffer);
+            ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer);
+            assert(buffer);
+
+            struct anv_buffer_view *view =
+               &set->buffer_views[bind_layout->buffer_index];
+            view += write->dstArrayElement + j;
+
+            const struct anv_format *format =
+               anv_format_for_descriptor_type(write->descriptorType);
+
+            view->format = format->surface_format;
+            view->bo = buffer->bo;
+            view->offset = buffer->offset + write->pBufferInfo[j].offset;
+
+            /* For buffers with dynamic offsets, we use the full possible
+             * range in the surface state and do the actual range-checking
+             * in the shader.
+             */
+            if (bind_layout->dynamic_offset_index >= 0 ||
+                write->pBufferInfo[j].range == VK_WHOLE_SIZE)
+               view->range = buffer->size - write->pBufferInfo[j].offset;
+            else
+               view->range = write->pBufferInfo[j].range;
+
+            anv_fill_buffer_surface_state(device, view->surface_state.map,
+                                          view->format,
+                                          view->offset, view->range, 1);
+
+            /* Without LLC the surface state must be flushed explicitly. */
+            if (!device->info.has_llc)
+               anv_state_clflush(view->surface_state);
+
+            desc[j] = (struct anv_descriptor) {
+               .type = write->descriptorType,
+               .buffer_view = view,
+            };
+         }
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   for (uint32_t i = 0; i < descriptorCopyCount; i++) {
+      const VkCopyDescriptorSet *copy = &pDescriptorCopies[i];
+      ANV_FROM_HANDLE(anv_descriptor_set, src, copy->srcSet);
+      ANV_FROM_HANDLE(anv_descriptor_set, dest, copy->dstSet);
+
+      const struct anv_descriptor_set_binding_layout *src_layout =
+         &src->layout->binding[copy->srcBinding];
+      const struct anv_descriptor_set_binding_layout *dst_layout =
+         &dest->layout->binding[copy->dstBinding];
+
+      /* Resolve both bindings through their layouts, exactly like the
+       * write path above, so array elements land in the right slots.
+       */
+      const struct anv_descriptor *src_desc =
+         &src->descriptors[src_layout->descriptor_index +
+                           copy->srcArrayElement];
+      struct anv_descriptor *dst_desc =
+         &dest->descriptors[dst_layout->descriptor_index +
+                            copy->dstArrayElement];
+
+      for (uint32_t j = 0; j < copy->descriptorCount; j++)
+         dst_desc[j] = src_desc[j];
+   }
+}
diff --git a/src/vulkan/anv_device.c b/src/vulkan/anv_device.c
new file mode 100644 (file)
index 0000000..7d0f25e
--- /dev/null
@@ -0,0 +1,1723 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+#include "mesa/main/git_sha1.h"
+#include "util/strtod.h"
+#include "util/debug.h"
+
+#include "gen7_pack.h"
+
+struct anv_dispatch_table dtable;
+
+/* No-op shader_debug_log callback for the brw compiler. */
+static void
+compiler_debug_log(void *data, const char *fmt, ...)
+{ }
+
+/* shader_perf_log callback: forward compiler perf messages to stderr,
+ * but only when INTEL_DEBUG=perf is set.
+ */
+static void
+compiler_perf_log(void *data, const char *fmt, ...)
+{
+   va_list args;
+   va_start(args, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF))
+      vfprintf(stderr, fmt, args);
+
+   va_end(args);
+}
+
+/* Probe the DRM node at `path` and fill in `device`.
+ *
+ * Opens the fd only long enough to query GEM parameters (chipset id,
+ * aperture, required kernel features); the fd is closed before the
+ * compiler is created.  Returns VK_ERROR_INCOMPATIBLE_DRIVER for hardware
+ * we don't support and VK_ERROR_INITIALIZATION_FAILED for kernel/ioctl
+ * problems.
+ *
+ * Fix versus the original: the brw_compiler_create failure path jumped to
+ * `fail:` after the fd had already been closed, closing it a second time
+ * (possibly closing an unrelated fd reopened with the same number).  That
+ * path now returns directly.
+ */
+static VkResult
+anv_physical_device_init(struct anv_physical_device *device,
+                         struct anv_instance *instance,
+                         const char *path)
+{
+   VkResult result;
+   int fd;
+
+   fd = open(path, O_RDWR | O_CLOEXEC);
+   if (fd < 0)
+      return vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                       "failed to open %s: %m", path);
+
+   device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+   device->instance = instance;
+   device->path = path;
+
+   device->chipset_id = anv_gem_get_param(fd, I915_PARAM_CHIPSET_ID);
+   if (!device->chipset_id) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "failed to get chipset id: %m");
+      goto fail;
+   }
+
+   device->name = brw_get_device_name(device->chipset_id);
+   device->info = brw_get_device_info(device->chipset_id);
+   if (!device->info) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "failed to get device info");
+      goto fail;
+   }
+
+   if (device->info->is_haswell) {
+      fprintf(stderr, "WARNING: Haswell Vulkan support is incomplete\n");
+   } else if (device->info->gen == 7 && !device->info->is_baytrail) {
+      fprintf(stderr, "WARNING: Ivy Bridge Vulkan support is incomplete\n");
+   } else if (device->info->gen == 7 && device->info->is_baytrail) {
+      fprintf(stderr, "WARNING: Bay Trail Vulkan support is incomplete\n");
+   } else if (device->info->gen >= 8) {
+      /* Broadwell, Cherryview, Skylake, Broxton, Kabylake is as fully
+       * supported as anything */
+   } else {
+      result = vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
+                         "Vulkan not yet supported on %s", device->name);
+      goto fail;
+   }
+
+   if (anv_gem_get_aperture(fd, &device->aperture_size) == -1) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "failed to get aperture size: %m");
+      goto fail;
+   }
+
+   if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "kernel missing gem wait");
+      goto fail;
+   }
+
+   if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "kernel missing execbuf2");
+      goto fail;
+   }
+
+   /* Without LLC we need write-combining mmap support from the kernel. */
+   if (!device->info->has_llc &&
+       anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) {
+      result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+                         "kernel missing wc mmap");
+      goto fail;
+   }
+
+   bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
+
+   /* Done with the fd; everything below must NOT take the fail path,
+    * which would close it a second time.
+    */
+   close(fd);
+
+   brw_process_intel_debug_variable();
+
+   device->compiler = brw_compiler_create(NULL, device->info);
+   if (device->compiler == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   device->compiler->shader_debug_log = compiler_debug_log;
+   device->compiler->shader_perf_log = compiler_perf_log;
+
+   /* Default to use scalar GS on BDW+ */
+   device->compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+      device->info->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true);
+
+   /* XXX: Actually detect bit6 swizzling */
+   isl_device_init(&device->isl_dev, device->info, swizzled);
+
+   return VK_SUCCESS;
+
+fail:
+   close(fd);
+   return result;
+}
+
+/* Counterpart to anv_physical_device_init: tear down the compiler (the
+ * only resource the init path leaves alive).
+ */
+static void
+anv_physical_device_finish(struct anv_physical_device *device)
+{
+   ralloc_free(device->compiler);
+}
+
+/* Instance-level extensions advertised by the driver; checked against the
+ * names the client enables in anv_CreateInstance.
+ */
+static const VkExtensionProperties global_extensions[] = {
+   {
+      .extensionName = VK_KHR_SURFACE_EXTENSION_NAME,
+      .specVersion = 24,
+   },
+   {
+      .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME,
+      .specVersion = 5,
+   },
+#ifdef HAVE_WAYLAND_PLATFORM
+   {
+      .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
+      .specVersion = 4,
+   },
+#endif
+};
+
+/* Device-level extensions advertised by the driver. */
+static const VkExtensionProperties device_extensions[] = {
+   {
+      .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+      .specVersion = 67,
+   },
+};
+
+/* Default VkAllocationCallbacks pfnAllocation.  `align` is ignored on the
+ * assumption that malloc's fundamental alignment covers the driver's
+ * requests (anv_alloc* callers above use align <= 8) -- NOTE(review):
+ * confirm no caller ever asks for more.
+ */
+static void *
+default_alloc_func(void *pUserData, size_t size, size_t align,
+                   VkSystemAllocationScope allocationScope)
+{
+   return malloc(size);
+}
+
+/* Default VkAllocationCallbacks pfnReallocation; like default_alloc_func,
+ * the alignment argument is ignored.
+ */
+static void *
+default_realloc_func(void *pUserData, void *pOriginal, size_t size,
+                     size_t align, VkSystemAllocationScope allocationScope)
+{
+   return realloc(pOriginal, size);
+}
+
+/* Default VkAllocationCallbacks pfnFree. */
+static void
+default_free_func(void *pUserData, void *pMemory)
+{
+   free(pMemory);
+}
+
+/* Allocator used when the application passes pAllocator == NULL. */
+static const VkAllocationCallbacks default_alloc = {
+   .pUserData = NULL,
+   .pfnAllocation = default_alloc_func,
+   .pfnReallocation = default_realloc_func,
+   .pfnFree = default_free_func,
+};
+
+/* vkCreateInstance: validate the requested API version and extensions,
+ * then allocate and initialize the instance (locale, valgrind mempool,
+ * WSI state).
+ *
+ * Fix versus the original: VkInstanceCreateInfo::pApplicationInfo is
+ * optional per the Vulkan spec and was dereferenced unconditionally; a
+ * NULL pApplicationInfo now defaults to API version 1.0.0 instead of
+ * crashing.
+ */
+VkResult anv_CreateInstance(
+    const VkInstanceCreateInfo*                 pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkInstance*                                 pInstance)
+{
+   struct anv_instance *instance;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO);
+
+   uint32_t client_version = VK_MAKE_VERSION(1, 0, 0);
+   if (pCreateInfo->pApplicationInfo)
+      client_version = pCreateInfo->pApplicationInfo->apiVersion;
+
+   if (client_version != VK_MAKE_VERSION(1, 0, 0))
+      return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+
+   /* Reject any enabled extension we don't advertise. */
+   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
+      bool found = false;
+      for (uint32_t j = 0; j < ARRAY_SIZE(global_extensions); j++) {
+         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
+                    global_extensions[j].extensionName) == 0) {
+            found = true;
+            break;
+         }
+      }
+      if (!found)
+         return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+   }
+
+   instance = anv_alloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!instance)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+
+   if (pAllocator)
+      instance->alloc = *pAllocator;
+   else
+      instance->alloc = default_alloc;
+
+   instance->apiVersion = client_version;
+   /* -1 means "not enumerated yet"; see anv_EnumeratePhysicalDevices. */
+   instance->physicalDeviceCount = -1;
+
+   _mesa_locale_init();
+
+   VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
+
+   anv_init_wsi(instance);
+
+   *pInstance = anv_instance_to_handle(instance);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroyInstance: undo anv_CreateInstance in reverse order, including
+ * the lazily-initialized physical device if enumeration found one.
+ */
+void anv_DestroyInstance(
+    VkInstance                                  _instance,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_instance, instance, _instance);
+
+   if (instance->physicalDeviceCount > 0) {
+      /* We support at most one physical device. */
+      assert(instance->physicalDeviceCount == 1);
+      anv_physical_device_finish(&instance->physicalDevice);
+   }
+
+   anv_finish_wsi(instance);
+
+   VG(VALGRIND_DESTROY_MEMPOOL(instance));
+
+   _mesa_locale_fini();
+
+   anv_free(&instance->alloc, instance);
+}
+
+/* vkEnumeratePhysicalDevices: lazily probe the (single) render node on
+ * first call, caching the result in instance->physicalDeviceCount
+ * (-1 = not probed, 0 = incompatible, 1 = found).
+ */
+VkResult anv_EnumeratePhysicalDevices(
+    VkInstance                                  _instance,
+    uint32_t*                                   pPhysicalDeviceCount,
+    VkPhysicalDevice*                           pPhysicalDevices)
+{
+   ANV_FROM_HANDLE(anv_instance, instance, _instance);
+   VkResult result;
+
+   if (instance->physicalDeviceCount < 0) {
+      result = anv_physical_device_init(&instance->physicalDevice,
+                                        instance, "/dev/dri/renderD128");
+      if (result == VK_ERROR_INCOMPATIBLE_DRIVER) {
+         instance->physicalDeviceCount = 0;
+      } else if (result == VK_SUCCESS) {
+         instance->physicalDeviceCount = 1;
+      } else {
+         return result;
+      }
+   }
+
+   /* pPhysicalDeviceCount is an out parameter if pPhysicalDevices is NULL;
+    * otherwise it's an inout parameter.
+    *
+    * The Vulkan spec (git aaed022) says:
+    *
+    *    pPhysicalDeviceCount is a pointer to an unsigned integer variable
+    *    that is initialized with the number of devices the application is
+    *    prepared to receive handles to. pname:pPhysicalDevices is pointer to
+    *    an array of at least this many VkPhysicalDevice handles [...].
+    *
+    *    Upon success, if pPhysicalDevices is NULL, vkEnumeratePhysicalDevices
+    *    overwrites the contents of the variable pointed to by
+    *    pPhysicalDeviceCount with the number of physical devices in in the
+    *    instance; otherwise, vkEnumeratePhysicalDevices overwrites
+    *    pPhysicalDeviceCount with the number of physical handles written to
+    *    pPhysicalDevices.
+    */
+   if (!pPhysicalDevices) {
+      *pPhysicalDeviceCount = instance->physicalDeviceCount;
+   } else if (*pPhysicalDeviceCount >= 1) {
+      pPhysicalDevices[0] = anv_physical_device_to_handle(&instance->physicalDevice);
+      *pPhysicalDeviceCount = 1;
+   } else {
+      /* NOTE(review): per the spec, when the caller's array is too small to
+       * hold every device, VK_INCOMPLETE should be returned -- TODO confirm
+       * and fix (we have a device here but report success with count 0).
+       */
+      *pPhysicalDeviceCount = 0;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* vkGetPhysicalDeviceFeatures: report a hard-coded feature set (values
+ * still being tuned -- see the anv_finishme).
+ *
+ * Fix versus the original: `.alphaToOne` appeared twice in the designated
+ * initializer list (both `true`, so harmless at runtime, but a defect and
+ * a -Winitializer-overrides warning); the duplicate is removed.
+ */
+void anv_GetPhysicalDeviceFeatures(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceFeatures*                   pFeatures)
+{
+   anv_finishme("Get correct values for PhysicalDeviceFeatures");
+
+   *pFeatures = (VkPhysicalDeviceFeatures) {
+      .robustBufferAccess                       = false,
+      .fullDrawIndexUint32                      = false,
+      .imageCubeArray                           = false,
+      .independentBlend                         = false,
+      .geometryShader                           = true,
+      .tessellationShader                       = false,
+      .sampleRateShading                        = false,
+      .dualSrcBlend                             = true,
+      .logicOp                                  = true,
+      .multiDrawIndirect                        = false,
+      .drawIndirectFirstInstance                = false,
+      .depthClamp                               = false,
+      .depthBiasClamp                           = false,
+      .fillModeNonSolid                         = true,
+      .depthBounds                              = false,
+      .wideLines                                = true,
+      .largePoints                              = true,
+      .alphaToOne                               = true,
+      .multiViewport                            = true,
+      .samplerAnisotropy                        = false, /* FINISHME */
+      .textureCompressionETC2                   = true,
+      .textureCompressionASTC_LDR               = true,
+      .textureCompressionBC                     = true,
+      .occlusionQueryPrecise                    = false, /* FINISHME */
+      .pipelineStatisticsQuery                  = true,
+      .vertexPipelineStoresAndAtomics           = false,
+      .fragmentStoresAndAtomics                 = true,
+      .shaderTessellationAndGeometryPointSize   = true,
+      .shaderImageGatherExtended                = true,
+      .shaderStorageImageExtendedFormats        = false,
+      .shaderStorageImageMultisample            = false,
+      .shaderUniformBufferArrayDynamicIndexing  = true,
+      .shaderSampledImageArrayDynamicIndexing   = false,
+      .shaderStorageBufferArrayDynamicIndexing  = false,
+      .shaderStorageImageArrayDynamicIndexing   = false,
+      .shaderStorageImageReadWithoutFormat      = false,
+      .shaderStorageImageWriteWithoutFormat     = true,
+      .shaderClipDistance                       = false,
+      .shaderCullDistance                       = false,
+      .shaderFloat64                            = false,
+      .shaderInt64                              = false,
+      .shaderInt16                              = false,
+      .variableMultisampleRate                  = false,
+      .inheritedQueries                         = false,
+   };
+}
+
+void anv_GetPhysicalDeviceProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceProperties*                 pProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+   const struct brw_device_info *devinfo = pdevice->info;
+
+   anv_finishme("Get correct values for VkPhysicalDeviceLimits");
+
+   const float time_stamp_base = devinfo->gen >= 9 ? 83.333 : 80.0;
+
+   VkSampleCountFlags sample_counts =
+      VK_SAMPLE_COUNT_1_BIT |
+      VK_SAMPLE_COUNT_2_BIT |
+      VK_SAMPLE_COUNT_4_BIT |
+      VK_SAMPLE_COUNT_8_BIT;
+
+   VkPhysicalDeviceLimits limits = {
+      .maxImageDimension1D                      = (1 << 14),
+      .maxImageDimension2D                      = (1 << 14),
+      .maxImageDimension3D                      = (1 << 10),
+      .maxImageDimensionCube                    = (1 << 14),
+      .maxImageArrayLayers                      = (1 << 10),
+      .maxTexelBufferElements                   = (1 << 14),
+      .maxUniformBufferRange                    = UINT32_MAX,
+      .maxStorageBufferRange                    = UINT32_MAX,
+      .maxPushConstantsSize                     = MAX_PUSH_CONSTANTS_SIZE,
+      .maxMemoryAllocationCount                 = UINT32_MAX,
+      .maxSamplerAllocationCount                = 64 * 1024,
+      .bufferImageGranularity                   = 64, /* A cache line */
+      .sparseAddressSpaceSize                   = 0,
+      .maxBoundDescriptorSets                   = MAX_SETS,
+      .maxPerStageDescriptorSamplers            = 64,
+      .maxPerStageDescriptorUniformBuffers      = 64,
+      .maxPerStageDescriptorStorageBuffers      = 64,
+      .maxPerStageDescriptorSampledImages       = 64,
+      .maxPerStageDescriptorStorageImages       = 64,
+      .maxPerStageDescriptorInputAttachments    = 64,
+      .maxPerStageResources                     = 128,
+      .maxDescriptorSetSamplers                 = 256,
+      .maxDescriptorSetUniformBuffers           = 256,
+      .maxDescriptorSetUniformBuffersDynamic    = 256,
+      .maxDescriptorSetStorageBuffers           = 256,
+      .maxDescriptorSetStorageBuffersDynamic    = 256,
+      .maxDescriptorSetSampledImages            = 256,
+      .maxDescriptorSetStorageImages            = 256,
+      .maxDescriptorSetInputAttachments         = 256,
+      .maxVertexInputAttributes                 = 32,
+      .maxVertexInputBindings                   = 32,
+      .maxVertexInputAttributeOffset            = 256,
+      .maxVertexInputBindingStride              = 256,
+      .maxVertexOutputComponents                = 32,
+      .maxTessellationGenerationLevel           = 0,
+      .maxTessellationPatchSize                 = 0,
+      .maxTessellationControlPerVertexInputComponents = 0,
+      .maxTessellationControlPerVertexOutputComponents = 0,
+      .maxTessellationControlPerPatchOutputComponents = 0,
+      .maxTessellationControlTotalOutputComponents = 0,
+      .maxTessellationEvaluationInputComponents = 0,
+      .maxTessellationEvaluationOutputComponents = 0,
+      .maxGeometryShaderInvocations             = 6,
+      .maxGeometryInputComponents               = 16,
+      .maxGeometryOutputComponents              = 16,
+      .maxGeometryOutputVertices                = 16,
+      .maxGeometryTotalOutputComponents         = 16,
+      .maxFragmentInputComponents               = 16,
+      .maxFragmentOutputAttachments             = 8,
+      .maxFragmentDualSrcAttachments            = 2,
+      .maxFragmentCombinedOutputResources       = 8,
+      .maxComputeSharedMemorySize               = 1024,
+      .maxComputeWorkGroupCount = {
+         16 * devinfo->max_cs_threads,
+         16 * devinfo->max_cs_threads,
+         16 * devinfo->max_cs_threads,
+      },
+      .maxComputeWorkGroupInvocations           = 16 * devinfo->max_cs_threads,
+      .maxComputeWorkGroupSize = {
+         16 * devinfo->max_cs_threads,
+         16 * devinfo->max_cs_threads,
+         16 * devinfo->max_cs_threads,
+      },
+      .subPixelPrecisionBits                    = 4 /* FIXME */,
+      .subTexelPrecisionBits                    = 4 /* FIXME */,
+      .mipmapPrecisionBits                      = 4 /* FIXME */,
+      .maxDrawIndexedIndexValue                 = UINT32_MAX,
+      .maxDrawIndirectCount                     = UINT32_MAX,
+      .maxSamplerLodBias                        = 16,
+      .maxSamplerAnisotropy                     = 16,
+      .maxViewports                             = MAX_VIEWPORTS,
+      .maxViewportDimensions                    = { (1 << 14), (1 << 14) },
+      .viewportBoundsRange                      = { -1.0, 1.0 }, /* FIXME */
+      .viewportSubPixelBits                     = 13, /* We take a float? */
+      .minMemoryMapAlignment                    = 4096, /* A page */
+      .minTexelBufferOffsetAlignment            = 1,
+      .minUniformBufferOffsetAlignment          = 1,
+      .minStorageBufferOffsetAlignment          = 1,
+      .minTexelOffset                           = 0, /* FIXME */
+      .maxTexelOffset                           = 0, /* FIXME */
+      .minTexelGatherOffset                     = 0, /* FIXME */
+      .maxTexelGatherOffset                     = 0, /* FIXME */
+      .minInterpolationOffset                   = 0, /* FIXME */
+      .maxInterpolationOffset                   = 0, /* FIXME */
+      .subPixelInterpolationOffsetBits          = 0, /* FIXME */
+      .maxFramebufferWidth                      = (1 << 14),
+      .maxFramebufferHeight                     = (1 << 14),
+      .maxFramebufferLayers                     = (1 << 10),
+      .framebufferColorSampleCounts             = sample_counts,
+      .framebufferDepthSampleCounts             = sample_counts,
+      .framebufferStencilSampleCounts           = sample_counts,
+      .framebufferNoAttachmentsSampleCounts     = sample_counts,
+      .maxColorAttachments                      = MAX_RTS,
+      .sampledImageColorSampleCounts            = sample_counts,
+      .sampledImageIntegerSampleCounts          = VK_SAMPLE_COUNT_1_BIT,
+      .sampledImageDepthSampleCounts            = sample_counts,
+      .sampledImageStencilSampleCounts          = sample_counts,
+      .storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
+      .maxSampleMaskWords                       = 1,
+      .timestampComputeAndGraphics              = false,
+      .timestampPeriod                          = time_stamp_base / (1000 * 1000 * 1000),
+      .maxClipDistances                         = 0 /* FIXME */,
+      .maxCullDistances                         = 0 /* FIXME */,
+      .maxCombinedClipAndCullDistances          = 0 /* FIXME */,
+      .discreteQueuePriorities                  = 1,
+      .pointSizeRange                           = { 0.125, 255.875 },
+      .lineWidthRange                           = { 0.0, 7.9921875 },
+      .pointSizeGranularity                     = (1.0 / 8.0),
+      .lineWidthGranularity                     = (1.0 / 128.0),
+      .strictLines                              = false, /* FINISHME */
+      .standardSampleLocations                  = true, /* FINISHME */
+      .optimalBufferCopyOffsetAlignment         = 128,
+      .optimalBufferCopyRowPitchAlignment       = 128,
+      .nonCoherentAtomSize                      = 64,
+   };
+
+   *pProperties = (VkPhysicalDeviceProperties) {
+      .apiVersion = VK_MAKE_VERSION(1, 0, 0),
+      .driverVersion = 1,
+      .vendorID = 0x8086,
+      .deviceID = pdevice->chipset_id,
+      .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
+      .limits = limits,
+      .sparseProperties = {0}, /* Broadwell doesn't do sparse. */
+   };
+
+   strcpy(pProperties->deviceName, pdevice->name);
+   snprintf((char *)pProperties->pipelineCacheUUID, VK_UUID_SIZE,
+            "anv-%s", MESA_GIT_SHA1 + 4);
+}
+
+/* Standard Vulkan two-call idiom: when pQueueFamilyProperties is NULL the
+ * caller is only querying the count.  This driver exposes exactly one queue
+ * family, supporting graphics, compute and transfer.
+ */
+void anv_GetPhysicalDeviceQueueFamilyProperties(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pCount,
+    VkQueueFamilyProperties*                    pQueueFamilyProperties)
+{
+   if (pQueueFamilyProperties == NULL) {
+      *pCount = 1;
+      return;
+   }
+
+   assert(*pCount >= 1);
+
+   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
+      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
+                    VK_QUEUE_COMPUTE_BIT |
+                    VK_QUEUE_TRANSFER_BIT,
+      .queueCount = 1,
+      .timestampValidBits = 36, /* XXX: Real value here */
+      .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
+   };
+}
+
+/* Report the memory types/heaps for this GPU.  One heap (75% of the GEM
+ * aperture); one memory type on LLC parts, two on non-LLC (Atom) parts so
+ * the app can pick between coherent-uncached (WC) and cached-incoherent.
+ */
+void anv_GetPhysicalDeviceMemoryProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkPhysicalDeviceMemoryProperties*           pMemoryProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+   VkDeviceSize heap_size;
+
+   /* Reserve some wiggle room for the driver by exposing only 75% of the
+    * aperture to the heap.
+    */
+   heap_size = 3 * physical_device->aperture_size / 4;
+
+   if (physical_device->info->has_llc) {
+      /* Big core GPUs share LLC with the CPU and thus one memory type can be
+       * both cached and coherent at the same time.
+       */
+      pMemoryProperties->memoryTypeCount = 1;
+      pMemoryProperties->memoryTypes[0] = (VkMemoryType) {
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+                          VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+         .heapIndex = 0,
+      };
+   } else {
+      /* The spec requires that we expose a host-visible, coherent memory
+       * type, but Atom GPUs don't share LLC. Thus we offer two memory types
+       * to give the application a choice between cached, but not coherent and
+       * coherent but uncached (WC though).
+       */
+      pMemoryProperties->memoryTypeCount = 2;
+      pMemoryProperties->memoryTypes[0] = (VkMemoryType) {
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+         .heapIndex = 0,
+      };
+      pMemoryProperties->memoryTypes[1] = (VkMemoryType) {
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                          VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+         .heapIndex = 0,
+      };
+   }
+
+   /* A single heap backs every memory type above. */
+   pMemoryProperties->memoryHeapCount = 1;
+   pMemoryProperties->memoryHeaps[0] = (VkMemoryHeap) {
+      .size = heap_size,
+      .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+   };
+}
+
+/* Entry-point lookup: resolved purely by name from the generated entrypoint
+ * table; the instance handle is not consulted.
+ */
+PFN_vkVoidFunction anv_GetInstanceProcAddr(
+    VkInstance                                  instance,
+    const char*                                 pName)
+{
+   return anv_lookup_entrypoint(pName);
+}
+
+/* Same name-only lookup as anv_GetInstanceProcAddr; the device handle is
+ * ignored.
+ */
+PFN_vkVoidFunction anv_GetDeviceProcAddr(
+    VkDevice                                    device,
+    const char*                                 pName)
+{
+   return anv_lookup_entrypoint(pName);
+}
+
+/* Initialize the device's single queue.  Sets the loader magic so the ICD
+ * loader accepts the dispatchable handle, and points the queue at the
+ * device's surface state pool.  Cannot fail.
+ */
+static VkResult
+anv_queue_init(struct anv_device *device, struct anv_queue *queue)
+{
+   queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+   queue->device = device;
+   queue->pool = &device->surface_state_pool;
+
+   return VK_SUCCESS;
+}
+
+/* Queue teardown counterpart to anv_queue_init; currently nothing to free. */
+static void
+anv_queue_finish(struct anv_queue *queue)
+{
+}
+
+/* Allocate a state from the pool and copy `size` bytes of `p` into it.
+ * On non-LLC parts the copy is clflushed so the GPU sees the data.
+ * Returns the allocated state (caller owns it).
+ */
+static struct anv_state
+anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p)
+{
+   struct anv_state state;
+
+   state = anv_state_pool_alloc(pool, size, align);
+   memcpy(state.map, p, size);
+
+   if (!pool->block_pool->device->info.has_llc)
+      anv_state_clflush(state);
+
+   return state;
+}
+
+/* One hardware border-color table entry: four floats or four uints,
+ * padded out to 64 bytes (presumably the hardware entry stride — the
+ * border_colors table below is also 64-byte aligned).
+ */
+struct gen8_border_color {
+   union {
+      float float32[4];
+      uint32_t uint32[4];
+   };
+   /* Pad out to 64 bytes */
+   uint32_t _pad[12];
+};
+
+/* Upload the six standard VkBorderColor values into dynamic state memory,
+ * indexed directly by the VK_BORDER_COLOR_* enum, 64-byte aligned.
+ */
+static void
+anv_device_init_border_colors(struct anv_device *device)
+{
+   static const struct gen8_border_color border_colors[] = {
+      [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] =  { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
+      [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] =       { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
+      [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] =       { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
+      [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] =    { .uint32 = { 0, 0, 0, 0 } },
+      [VK_BORDER_COLOR_INT_OPAQUE_BLACK] =         { .uint32 = { 0, 0, 0, 1 } },
+      [VK_BORDER_COLOR_INT_OPAQUE_WHITE] =         { .uint32 = { 1, 1, 1, 1 } },
+   };
+
+   device->border_colors = anv_state_pool_emit_data(&device->dynamic_state_pool,
+                                                    sizeof(border_colors), 64,
+                                                    border_colors);
+}
+
+/* Create a logical device: validate requested extensions, open the DRM
+ * node, create a GEM context, then set up the device's BO/block/state
+ * pools, queue, meta state and border colors.
+ *
+ * Error unwinding uses the goto-cleanup pattern: fail_fd closes the fd,
+ * fail_device frees the device allocation.
+ */
+VkResult anv_CreateDevice(
+    VkPhysicalDevice                            physicalDevice,
+    const VkDeviceCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDevice*                                   pDevice)
+{
+   ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+   VkResult result;
+   struct anv_device *device;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
+
+   /* Reject creation if any requested extension is not in our static
+    * device_extensions table.
+    */
+   for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
+      bool found = false;
+      for (uint32_t j = 0; j < ARRAY_SIZE(device_extensions); j++) {
+         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
+                    device_extensions[j].extensionName) == 0) {
+            found = true;
+            break;
+         }
+      }
+      if (!found)
+         return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+   }
+
+   anv_set_dispatch_devinfo(physical_device->info);
+
+   device = anv_alloc2(&physical_device->instance->alloc, pAllocator,
+                       sizeof(*device), 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!device)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+   device->instance = physical_device->instance;
+
+   /* Per spec, a NULL allocator means inherit the instance's. */
+   if (pAllocator)
+      device->alloc = *pAllocator;
+   else
+      device->alloc = physical_device->instance->alloc;
+
+   /* XXX(chadv): Can we dup() physicalDevice->fd here? */
+   device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
+   if (device->fd == -1) {
+      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+      goto fail_device;
+   }
+
+   device->context_id = anv_gem_create_context(device);
+   if (device->context_id == -1) {
+      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+      goto fail_fd;
+   }
+
+   device->info = *physical_device->info;
+   device->isl_dev = physical_device->isl_dev;
+
+   pthread_mutex_init(&device->mutex, NULL);
+
+   /* NOTE(review): the pool/BO init calls below are not checked for
+    * failure, and the fail_fd path after a meta-init failure does not
+    * tear down the pools or the mutex — verify whether these inits are
+    * infallible or whether a fuller unwind chain is needed.
+    */
+   anv_bo_pool_init(&device->batch_bo_pool, device, ANV_CMD_BUFFER_BATCH_SIZE);
+
+   anv_block_pool_init(&device->dynamic_state_block_pool, device, 16384);
+
+   anv_state_pool_init(&device->dynamic_state_pool,
+                       &device->dynamic_state_block_pool);
+
+   anv_block_pool_init(&device->instruction_block_pool, device, 128 * 1024);
+   anv_pipeline_cache_init(&device->default_pipeline_cache, device);
+
+   anv_block_pool_init(&device->surface_state_block_pool, device, 4096);
+
+   anv_state_pool_init(&device->surface_state_pool,
+                       &device->surface_state_block_pool);
+
+   anv_bo_init_new(&device->workaround_bo, device, 1024);
+
+   anv_block_pool_init(&device->scratch_block_pool, device, 0x10000);
+
+   anv_queue_init(device, &device->queue);
+
+   result = anv_device_init_meta(device);
+   if (result != VK_SUCCESS)
+      goto fail_fd;
+
+   anv_device_init_border_colors(device);
+
+   *pDevice = anv_device_to_handle(device);
+
+   return VK_SUCCESS;
+
+ fail_fd:
+   close(device->fd);
+ fail_device:
+   anv_free(&device->alloc, device);
+
+   return result;
+}
+
+/* Destroy a logical device: tear down meta state and all pools in reverse
+ * of the order anv_CreateDevice set them up, unmap/close the workaround BO,
+ * close the DRM fd, and free the device allocation.
+ */
+void anv_DestroyDevice(
+    VkDevice                                    _device,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   anv_queue_finish(&device->queue);
+
+   anv_device_finish_meta(device);
+
+#ifdef HAVE_VALGRIND
+   /* We only need to free these to prevent valgrind errors.  The backing
+    * BO will go away in a couple of lines so we don't actually leak.
+    */
+   anv_state_pool_free(&device->dynamic_state_pool, device->border_colors);
+#endif
+
+   anv_gem_munmap(device->workaround_bo.map, device->workaround_bo.size);
+   anv_gem_close(device, device->workaround_bo.gem_handle);
+
+   anv_bo_pool_finish(&device->batch_bo_pool);
+   anv_state_pool_finish(&device->dynamic_state_pool);
+   anv_block_pool_finish(&device->dynamic_state_block_pool);
+   anv_block_pool_finish(&device->instruction_block_pool);
+   anv_state_pool_finish(&device->surface_state_pool);
+   anv_block_pool_finish(&device->surface_state_block_pool);
+   anv_block_pool_finish(&device->scratch_block_pool);
+
+   close(device->fd);
+
+   pthread_mutex_destroy(&device->mutex);
+
+   anv_free(&device->alloc, device);
+}
+
+/* Two-call enumeration of the static global_extensions table: NULL output
+ * array returns the count; otherwise the whole table is copied out
+ * (pLayerName is ignored — no layers are supported).
+ */
+VkResult anv_EnumerateInstanceExtensionProperties(
+    const char*                                 pLayerName,
+    uint32_t*                                   pPropertyCount,
+    VkExtensionProperties*                      pProperties)
+{
+   if (pProperties == NULL) {
+      *pPropertyCount = ARRAY_SIZE(global_extensions);
+      return VK_SUCCESS;
+   }
+
+   assert(*pPropertyCount >= ARRAY_SIZE(global_extensions));
+
+   *pPropertyCount = ARRAY_SIZE(global_extensions);
+   memcpy(pProperties, global_extensions, sizeof(global_extensions));
+
+   return VK_SUCCESS;
+}
+
+/* Same two-call pattern as the instance variant, but over the static
+ * device_extensions table.
+ */
+VkResult anv_EnumerateDeviceExtensionProperties(
+    VkPhysicalDevice                            physicalDevice,
+    const char*                                 pLayerName,
+    uint32_t*                                   pPropertyCount,
+    VkExtensionProperties*                      pProperties)
+{
+   if (pProperties == NULL) {
+      *pPropertyCount = ARRAY_SIZE(device_extensions);
+      return VK_SUCCESS;
+   }
+
+   assert(*pPropertyCount >= ARRAY_SIZE(device_extensions));
+
+   *pPropertyCount = ARRAY_SIZE(device_extensions);
+   memcpy(pProperties, device_extensions, sizeof(device_extensions));
+
+   return VK_SUCCESS;
+}
+
+/* No instance layers are implemented: the count query returns 0, and any
+ * attempt to fetch properties reports VK_ERROR_LAYER_NOT_PRESENT.
+ */
+VkResult anv_EnumerateInstanceLayerProperties(
+    uint32_t*                                   pPropertyCount,
+    VkLayerProperties*                          pProperties)
+{
+   if (pProperties == NULL) {
+      *pPropertyCount = 0;
+      return VK_SUCCESS;
+   }
+
+   /* None supported at this time */
+   return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+}
+
+/* No device layers are implemented; mirrors the instance-layer behavior. */
+VkResult anv_EnumerateDeviceLayerProperties(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkLayerProperties*                          pProperties)
+{
+   if (pProperties == NULL) {
+      *pPropertyCount = 0;
+      return VK_SUCCESS;
+   }
+
+   /* None supported at this time */
+   return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+}
+
+/* Hand back the device's one-and-only queue.  Only queueIndex 0 is valid
+ * (the single queue family advertises queueCount = 1).
+ */
+void anv_GetDeviceQueue(
+    VkDevice                                    _device,
+    uint32_t                                    queueNodeIndex,
+    uint32_t                                    queueIndex,
+    VkQueue*                                    pQueue)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   assert(queueIndex == 0);
+
+   *pQueue = anv_queue_to_handle(&device->queue);
+}
+
+/* Submit command buffers via execbuffer2.  Each primary command buffer is
+ * submitted individually; if a fence is given, its own tiny batch is
+ * submitted after every command buffer so waiting on the fence BO implies
+ * the preceding work finished.  After each submit the kernel-assigned BO
+ * offsets are copied back so later relocations can be skipped.
+ */
+VkResult anv_QueueSubmit(
+    VkQueue                                     _queue,
+    uint32_t                                    submitCount,
+    const VkSubmitInfo*                         pSubmits,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   struct anv_device *device = queue->device;
+   int ret;
+
+   for (uint32_t i = 0; i < submitCount; i++) {
+      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
+                         pSubmits[i].pCommandBuffers[j]);
+         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+         ret = anv_gem_execbuffer(device, &cmd_buffer->execbuf2.execbuf);
+         if (ret != 0) {
+            /* We don't know the real error. */
+            return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                             "execbuf2 failed: %m");
+         }
+
+         if (fence) {
+            ret = anv_gem_execbuffer(device, &fence->execbuf);
+            if (ret != 0) {
+               /* We don't know the real error. */
+               return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                                "execbuf2 failed: %m");
+            }
+         }
+
+         /* Record where the kernel actually placed each BO. */
+         for (uint32_t k = 0; k < cmd_buffer->execbuf2.bo_count; k++)
+            cmd_buffer->execbuf2.bos[k]->offset = cmd_buffer->execbuf2.objects[k].offset;
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+/* With a single queue per device, queue-idle is the same as device-idle,
+ * so just delegate through the dispatch table.
+ */
+VkResult anv_QueueWaitIdle(
+    VkQueue                                     _queue)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+
+   return ANV_CALL(DeviceWaitIdle)(anv_device_to_handle(queue->device));
+}
+
+/* Wait for the GPU to go idle by submitting a trivial batch
+ * (MI_BATCH_BUFFER_END + MI_NOOP) from dynamic state memory and then
+ * blocking on its BO with an effectively-infinite GEM wait.  The 32-byte
+ * state allocation is freed on both the success and failure paths.
+ */
+VkResult anv_DeviceWaitIdle(
+    VkDevice                                    _device)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_state state;
+   struct anv_batch batch;
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 exec2_objects[1];
+   struct anv_bo *bo = NULL;
+   VkResult result;
+   int64_t timeout;
+   int ret;
+
+   /* Build the 2-instruction batch directly in dynamic state memory. */
+   state = anv_state_pool_alloc(&device->dynamic_state_pool, 32, 32);
+   bo = &device->dynamic_state_pool.block_pool->bo;
+   batch.start = batch.next = state.map;
+   batch.end = state.map + 32;
+   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END);
+   anv_batch_emit(&batch, GEN7_MI_NOOP);
+
+   /* Non-LLC parts need an explicit flush for the GPU to see the batch. */
+   if (!device->info.has_llc)
+      anv_state_clflush(state);
+
+   exec2_objects[0].handle = bo->gem_handle;
+   exec2_objects[0].relocation_count = 0;
+   exec2_objects[0].relocs_ptr = 0;
+   exec2_objects[0].alignment = 0;
+   exec2_objects[0].offset = bo->offset;
+   exec2_objects[0].flags = 0;
+   exec2_objects[0].rsvd1 = 0;
+   exec2_objects[0].rsvd2 = 0;
+
+   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
+   execbuf.buffer_count = 1;
+   execbuf.batch_start_offset = state.offset;
+   execbuf.batch_len = batch.next - state.map;
+   execbuf.cliprects_ptr = 0;
+   execbuf.num_cliprects = 0;
+   execbuf.DR1 = 0;
+   execbuf.DR4 = 0;
+
+   execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   execbuf.rsvd1 = device->context_id;
+   execbuf.rsvd2 = 0;
+
+   ret = anv_gem_execbuffer(device, &execbuf);
+   if (ret != 0) {
+      /* We don't know the real error. */
+      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
+      goto fail;
+   }
+
+   /* Block until the batch (and hence all prior work) retires. */
+   timeout = INT64_MAX;
+   ret = anv_gem_wait(device, bo->gem_handle, &timeout);
+   if (ret != 0) {
+      /* We don't know the real error. */
+      result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
+      goto fail;
+   }
+
+   anv_state_pool_free(&device->dynamic_state_pool, state);
+
+   return VK_SUCCESS;
+
+ fail:
+   anv_state_pool_free(&device->dynamic_state_pool, state);
+
+   return result;
+}
+
+/* Create a fresh GEM BO of `size` bytes and initialize the wrapper struct
+ * (unmapped, offset 0).  Returns VK_ERROR_OUT_OF_DEVICE_MEMORY when the
+ * GEM create fails (handle 0).
+ */
+VkResult
+anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size)
+{
+   bo->gem_handle = anv_gem_create(device, size);
+   if (!bo->gem_handle)
+      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   bo->map = NULL;
+   bo->index = 0;
+   bo->offset = 0;
+   bo->size = size;
+
+   return VK_SUCCESS;
+}
+
+/* Allocate device memory backed by a GEM BO.  A zero-size allocation is
+ * legal and yields VK_NULL_HANDLE.  Sizes are rounded up to a 4 KiB page
+ * since the kernel allocates whole pages anyway.
+ */
+VkResult anv_AllocateMemory(
+    VkDevice                                    _device,
+    const VkMemoryAllocateInfo*                 pAllocateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDeviceMemory*                             pMem)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_device_memory *mem;
+   VkResult result;
+
+   assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
+
+   if (pAllocateInfo->allocationSize == 0) {
+      /* Apparently, this is allowed */
+      *pMem = VK_NULL_HANDLE;
+      return VK_SUCCESS;
+   }
+
+   /* We support exactly one memory heap. */
+   assert(pAllocateInfo->memoryTypeIndex == 0 ||
+          (!device->info.has_llc && pAllocateInfo->memoryTypeIndex < 2));
+
+   /* FINISHME: Fail if allocation request exceeds heap size. */
+
+   mem = anv_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
+                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (mem == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* The kernel is going to give us whole pages anyway */
+   uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+
+   result = anv_bo_init_new(&mem->bo, device, alloc_size);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   /* Remembered so anv_MapMemory can pick WC vs. cached mapping flags. */
+   mem->type_index = pAllocateInfo->memoryTypeIndex;
+
+   *pMem = anv_device_memory_to_handle(mem);
+
+   return VK_SUCCESS;
+
+ fail:
+   anv_free2(&device->alloc, pAllocator, mem);
+
+   return result;
+}
+
+/* Free device memory: unmap if still mapped, close the GEM handle, and
+ * release the host-side struct.  NULL (VK_NULL_HANDLE from a zero-size
+ * allocation) is a no-op, as the spec requires.
+ */
+void anv_FreeMemory(
+    VkDevice                                    _device,
+    VkDeviceMemory                              _mem,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_device_memory, mem, _mem);
+
+   if (mem == NULL)
+      return;
+
+   if (mem->bo.map)
+      anv_gem_munmap(mem->bo.map, mem->bo.size);
+
+   if (mem->bo.gem_handle != 0)
+      anv_gem_close(device, mem->bo.gem_handle);
+
+   anv_free2(&device->alloc, pAllocator, mem);
+}
+
+/* Map a range of device memory into the application's address space via
+ * GEM mmap.  The requested offset is rounded down to a 4 KiB page (GEM
+ * requires page-aligned mappings) and the returned pointer is adjusted
+ * back by the remainder.  On non-LLC parts, type 0 memory is mapped
+ * write-combining.
+ */
+VkResult anv_MapMemory(
+    VkDevice                                    _device,
+    VkDeviceMemory                              _memory,
+    VkDeviceSize                                offset,
+    VkDeviceSize                                size,
+    VkMemoryMapFlags                            flags,
+    void**                                      ppData)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+
+   if (mem == NULL) {
+      *ppData = NULL;
+      return VK_SUCCESS;
+   }
+
+   if (size == VK_WHOLE_SIZE)
+      size = mem->bo.size - offset;
+
+   /* FIXME: Is this supposed to be thread safe? Since vkUnmapMemory() only
+    * takes a VkDeviceMemory pointer, it seems like only one map of the memory
+    * at a time is valid. We could just mmap up front and return an offset
+    * pointer here, but that may exhaust virtual memory on 32 bit
+    * userspace. */
+
+   uint32_t gem_flags = 0;
+   if (!device->info.has_llc && mem->type_index == 0)
+      gem_flags |= I915_MMAP_WC;
+
+   /* GEM will fail to map if the offset isn't 4k-aligned.  Round down. */
+   uint64_t map_offset = offset & ~4095ull;
+   assert(offset >= map_offset);
+   uint64_t map_size = (offset + size) - map_offset;
+
+   /* Let's map whole pages */
+   map_size = align_u64(map_size, 4096);
+
+   /* NOTE(review): the anv_gem_mmap result is not checked for failure
+    * before being dereferenced via pointer arithmetic below — confirm
+    * whether a failed mmap should return VK_ERROR_MEMORY_MAP_FAILED here.
+    */
+   mem->map = anv_gem_mmap(device, mem->bo.gem_handle,
+                           map_offset, map_size, gem_flags);
+   mem->map_size = map_size;
+
+   *ppData = mem->map + (offset - map_offset);
+
+   return VK_SUCCESS;
+}
+
+/* Undo anv_MapMemory: munmap the page-aligned mapping recorded in
+ * mem->map/mem->map_size.  NULL memory is a no-op.
+ */
+void anv_UnmapMemory(
+    VkDevice                                    _device,
+    VkDeviceMemory                              _memory)
+{
+   ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+
+   if (mem == NULL)
+      return;
+
+   anv_gem_munmap(mem->map, mem->map_size);
+}
+
+/* clflush every cache line covering each mapped range.  The start pointer
+ * is rounded down to a cache-line boundary so the first partially-covered
+ * line is flushed too.
+ */
+static void
+clflush_mapped_ranges(struct anv_device         *device,
+                      uint32_t                   count,
+                      const VkMappedMemoryRange *ranges)
+{
+   for (uint32_t i = 0; i < count; i++) {
+      ANV_FROM_HANDLE(anv_device_memory, mem, ranges[i].memory);
+      void *p = mem->map + (ranges[i].offset & ~CACHELINE_MASK);
+      void *end = mem->map + ranges[i].offset + ranges[i].size;
+
+      while (p < end) {
+         __builtin_ia32_clflush(p);
+         p += CACHELINE_SIZE;
+      }
+   }
+}
+
+/* Make host writes visible to the GPU.  On LLC parts memory is already
+ * coherent, so this is a no-op; otherwise fence pending stores and then
+ * clflush the ranges.
+ */
+VkResult anv_FlushMappedMemoryRanges(
+    VkDevice                                    _device,
+    uint32_t                                    memoryRangeCount,
+    const VkMappedMemoryRange*                  pMemoryRanges)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   if (device->info.has_llc)
+      return VK_SUCCESS;
+
+   /* Make sure the writes we're flushing have landed. */
+   __builtin_ia32_sfence();
+
+   clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+
+   return VK_SUCCESS;
+}
+
+/* Make GPU writes visible to the host: the mirror of Flush.  No-op on LLC
+ * parts; otherwise clflush the ranges and then lfence so subsequent reads
+ * are not reordered before the invalidate.
+ */
+VkResult anv_InvalidateMappedMemoryRanges(
+    VkDevice                                    _device,
+    uint32_t                                    memoryRangeCount,
+    const VkMappedMemoryRange*                  pMemoryRanges)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   if (device->info.has_llc)
+      return VK_SUCCESS;
+
+   clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+
+   /* Make sure no reads get moved up above the invalidate. */
+   __builtin_ia32_lfence();
+
+   return VK_SUCCESS;
+}
+
+/* Buffer memory requirements: size from the buffer, fixed 16-byte
+ * alignment, and bit 0 set for the single supported memory type.
+ */
+void anv_GetBufferMemoryRequirements(
+    VkDevice                                    device,
+    VkBuffer                                    _buffer,
+    VkMemoryRequirements*                       pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+   /* The Vulkan spec (git aaed022) says:
+    *
+    *    memoryTypeBits is a bitfield and contains one bit set for every
+    *    supported memory type for the resource. The bit `1<<i` is set if and
+    *    only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    *    structure for the physical device is supported.
+    *
+    * We support exactly one memory type.
+    */
+   pMemoryRequirements->memoryTypeBits = 1;
+
+   pMemoryRequirements->size = buffer->size;
+   pMemoryRequirements->alignment = 16;
+}
+
+/* Image memory requirements: size and alignment come from the image's
+ * surface layout; memory type bit 0 only, as for buffers.
+ */
+void anv_GetImageMemoryRequirements(
+    VkDevice                                    device,
+    VkImage                                     _image,
+    VkMemoryRequirements*                       pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_image, image, _image);
+
+   /* The Vulkan spec (git aaed022) says:
+    *
+    *    memoryTypeBits is a bitfield and contains one bit set for every
+    *    supported memory type for the resource. The bit `1<<i` is set if and
+    *    only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    *    structure for the physical device is supported.
+    *
+    * We support exactly one memory type.
+    */
+   pMemoryRequirements->memoryTypeBits = 1;
+
+   pMemoRequirements_size_note: ;
+   pMemoryRequirements->size = image->size;
+   pMemoryRequirements->alignment = image->alignment;
+}
+
+/* Sparse images are not supported; stub() logs/asserts the unimplemented
+ * entry point.
+ */
+void anv_GetImageSparseMemoryRequirements(
+    VkDevice                                    device,
+    VkImage                                     image,
+    uint32_t*                                   pSparseMemoryRequirementCount,
+    VkSparseImageMemoryRequirements*            pSparseMemoryRequirements)
+{
+   stub();
+}
+
+/* No lazily-allocated memory types are exposed, so committed memory is
+ * always reported as zero.
+ */
+void anv_GetDeviceMemoryCommitment(
+    VkDevice                                    device,
+    VkDeviceMemory                              memory,
+    VkDeviceSize*                               pCommittedMemoryInBytes)
+{
+   *pCommittedMemoryInBytes = 0;
+}
+
+/* Bind a buffer to device memory by recording the backing BO and offset
+ * on the buffer object.  A NULL memory handle clears the binding.
+ */
+VkResult anv_BindBufferMemory(
+    VkDevice                                    device,
+    VkBuffer                                    _buffer,
+    VkDeviceMemory                              _memory,
+    VkDeviceSize                                memoryOffset)
+{
+   ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+   if (mem) {
+      buffer->bo = &mem->bo;
+      buffer->offset = memoryOffset;
+   } else {
+      buffer->bo = NULL;
+      buffer->offset = 0;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Bind an image to device memory; same BO/offset bookkeeping as
+ * anv_BindBufferMemory.
+ */
+VkResult anv_BindImageMemory(
+    VkDevice                                    device,
+    VkImage                                     _image,
+    VkDeviceMemory                              _memory,
+    VkDeviceSize                                memoryOffset)
+{
+   ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+   ANV_FROM_HANDLE(anv_image, image, _image);
+
+   if (mem) {
+      image->bo = &mem->bo;
+      image->offset = memoryOffset;
+   } else {
+      image->bo = NULL;
+      image->offset = 0;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Sparse binding is not implemented; report an incompatible driver. */
+VkResult anv_QueueBindSparse(
+    VkQueue                                     queue,
+    uint32_t                                    bindInfoCount,
+    const VkBindSparseInfo*                     pBindInfo,
+    VkFence                                     fence)
+{
+   stub_return(VK_ERROR_INCOMPATIBLE_DRIVER);
+}
+
+/* Create a fence backed by its own tiny BO containing an end-of-batch
+ * sequence.  anv_QueueSubmit executes this pre-built execbuf after the real
+ * work, so GEM-waiting on the fence BO (see anv_GetFenceStatus /
+ * anv_WaitForFences) signals completion.
+ */
+VkResult anv_CreateFence(
+    VkDevice                                    _device,
+    const VkFenceCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFence*                                    pFence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_fence *fence;
+   struct anv_batch batch;
+   VkResult result;
+
+   const uint32_t fence_size = 128;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
+
+   fence = anv_alloc2(&device->alloc, pAllocator, sizeof(*fence), 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (fence == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_bo_init_new(&fence->bo, device, fence_size);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   /* Write the trivial batch (BATCH_BUFFER_END + NOOP) into the BO. */
+   fence->bo.map =
+      anv_gem_mmap(device, fence->bo.gem_handle, 0, fence->bo.size, 0);
+   batch.next = batch.start = fence->bo.map;
+   batch.end = fence->bo.map + fence->bo.size;
+   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END);
+   anv_batch_emit(&batch, GEN7_MI_NOOP);
+
+   /* Non-LLC: flush the (single cache line of) batch so the GPU sees it. */
+   if (!device->info.has_llc) {
+      assert(((uintptr_t) fence->bo.map & CACHELINE_MASK) == 0);
+      assert(batch.next - fence->bo.map <= CACHELINE_SIZE);
+      __builtin_ia32_sfence();
+      __builtin_ia32_clflush(fence->bo.map);
+   }
+
+   /* Pre-fill the execbuf so submission is a single ioctl later. */
+   fence->exec2_objects[0].handle = fence->bo.gem_handle;
+   fence->exec2_objects[0].relocation_count = 0;
+   fence->exec2_objects[0].relocs_ptr = 0;
+   fence->exec2_objects[0].alignment = 0;
+   fence->exec2_objects[0].offset = fence->bo.offset;
+   fence->exec2_objects[0].flags = 0;
+   fence->exec2_objects[0].rsvd1 = 0;
+   fence->exec2_objects[0].rsvd2 = 0;
+
+   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
+   fence->execbuf.buffer_count = 1;
+   fence->execbuf.batch_start_offset = 0;
+   fence->execbuf.batch_len = batch.next - fence->bo.map;
+   fence->execbuf.cliprects_ptr = 0;
+   fence->execbuf.num_cliprects = 0;
+   fence->execbuf.DR1 = 0;
+   fence->execbuf.DR4 = 0;
+
+   fence->execbuf.flags =
+      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
+   fence->execbuf.rsvd1 = device->context_id;
+   fence->execbuf.rsvd2 = 0;
+
+   *pFence = anv_fence_to_handle(fence);
+
+   return VK_SUCCESS;
+
+ fail:
+   anv_free2(&device->alloc, pAllocator, fence);
+
+   return result;
+}
+
+/* Destroy a fence: unmap and close its backing BO, then free the struct. */
+void anv_DestroyFence(
+    VkDevice                                    _device,
+    VkFence                                     _fence,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+
+   anv_gem_munmap(fence->bo.map, fence->bo.size);
+   anv_gem_close(device, fence->bo.gem_handle);
+   anv_free2(&device->alloc, pAllocator, fence);
+}
+
+VkResult anv_ResetFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences)
+{
+   /* Resetting only clears the CPU-side 'ready' flag; the kernel tracks the
+    * actual completion of the fence BO via gem_wait.
+    */
+   for (uint32_t i = 0; i < fenceCount; i++) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+      fence->ready = false;
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_GetFenceStatus(
+    VkDevice                                    _device,
+    VkFence                                     _fence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   /* Zero timeout turns the gem wait into a non-blocking poll. */
+   int64_t t = 0;
+   int ret;
+
+   /* Once signaled we cache the result and skip the ioctl. */
+   if (fence->ready)
+      return VK_SUCCESS;
+
+   ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
+   if (ret == 0) {
+      fence->ready = true;
+      return VK_SUCCESS;
+   }
+
+   /* Any wait failure (including timeout) is reported as not-ready. */
+   return VK_NOT_READY;
+}
+
+VkResult anv_WaitForFences(
+    VkDevice                                    _device,
+    uint32_t                                    fenceCount,
+    const VkFence*                              pFences,
+    VkBool32                                    waitAll,
+    uint64_t                                    timeout)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
+    * to block indefinitely for timeouts <= 0.  Unfortunately, this was
+    * broken for a couple of kernel releases.  Since there's no way to know
+    * whether or not the kernel we're using is one of the broken ones, the
+    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
+    * maximum timeout from 584 years to 292 years - likely not a big deal.
+    */
+   if (timeout > INT64_MAX)
+      timeout = INT64_MAX;
+
+   int64_t t = timeout;
+
+   /* FIXME: handle !waitAll */
+
+   /* NOTE(review): 't' is shared across iterations and passed by pointer to
+    * the ioctl; confirm the kernel's remaining-timeout update semantics so
+    * the total wait does not exceed 'timeout'.
+    */
+   for (uint32_t i = 0; i < fenceCount; i++) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+      int ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
+      if (ret == -1 && errno == ETIME) {
+         return VK_TIMEOUT;
+      } else if (ret == -1) {
+         /* We don't know the real error. */
+         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "gem wait failed: %m");
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+// Queue semaphore functions
+
+VkResult anv_CreateSemaphore(
+    VkDevice                                    device,
+    const VkSemaphoreCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSemaphore*                                pSemaphore)
+{
+   /* The DRM execbuffer ioctl always executes in-order, even between
+    * different rings. As such, there's nothing to do for the user space
+    * semaphore.
+    */
+
+   /* Hand back a non-NULL dummy handle so callers can treat it as valid. */
+   *pSemaphore = (VkSemaphore)1;
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroySemaphore(
+    VkDevice                                    device,
+    VkSemaphore                                 semaphore,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   /* Semaphores are dummy handles (see anv_CreateSemaphore); nothing to
+    * free.
+    */
+}
+
+// Event functions
+
+VkResult anv_CreateEvent(
+    VkDevice                                    _device,
+    const VkEventCreateInfo*                    pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkEvent*                                    pEvent)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_state state;
+   struct anv_event *event;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_EVENT_CREATE_INFO);
+
+   /* The event object lives in the dynamic state pool rather than in host
+    * memory allocated via pAllocator.
+    */
+   state = anv_state_pool_alloc(&device->dynamic_state_pool,
+                                sizeof(*event), 4);
+   event = state.map;
+   event->state = state;
+   /* The 'semaphore' field stores the event status as a VkResult value
+    * (VK_EVENT_RESET / VK_EVENT_SET).
+    */
+   event->semaphore = VK_EVENT_RESET;
+
+   if (!device->info.has_llc) {
+      /* Make sure the writes we're flushing have landed. */
+      __builtin_ia32_sfence();
+      __builtin_ia32_clflush(event);
+   }
+
+   *pEvent = anv_event_to_handle(event);
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroyEvent(
+    VkDevice                                    _device,
+    VkEvent                                     _event,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   /* Events are pool-allocated (see anv_CreateEvent), so return the state
+    * to the pool instead of calling anv_free2.
+    */
+   anv_state_pool_free(&device->dynamic_state_pool, event->state);
+}
+
+VkResult anv_GetEventStatus(
+    VkDevice                                    _device,
+    VkEvent                                     _event)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   if (!device->info.has_llc) {
+      /* NOTE(review): unlike the set/reset paths, this is a read; the
+       * clflush here presumably invalidates a stale cacheline before the
+       * load below (the "writes we're flushing" wording looked copy-pasted).
+       * Confirm intent.
+       */
+      __builtin_ia32_clflush(event);
+      __builtin_ia32_lfence();
+   }
+
+   /* 'semaphore' holds VK_EVENT_SET or VK_EVENT_RESET directly. */
+   return event->semaphore;
+}
+
+VkResult anv_SetEvent(
+    VkDevice                                    _device,
+    VkEvent                                     _event)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   event->semaphore = VK_EVENT_SET;
+
+   /* On non-LLC platforms the event memory is not cache-coherent, so push
+    * the update out of the CPU cache.
+    */
+   if (!device->info.has_llc) {
+      /* Make sure the writes we're flushing have landed. */
+      __builtin_ia32_sfence();
+      __builtin_ia32_clflush(event);
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_ResetEvent(
+    VkDevice                                    _device,
+    VkEvent                                     _event)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   event->semaphore = VK_EVENT_RESET;
+
+   /* Mirror of anv_SetEvent: flush the status update on non-LLC parts. */
+   if (!device->info.has_llc) {
+      /* Make sure the writes we're flushing have landed. */
+      __builtin_ia32_sfence();
+      __builtin_ia32_clflush(event);
+   }
+
+   return VK_SUCCESS;
+}
+
+// Buffer functions
+
+VkResult anv_CreateBuffer(
+    VkDevice                                    _device,
+    const VkBufferCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkBuffer*                                   pBuffer)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_buffer *buffer;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO);
+
+   buffer = anv_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (buffer == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   buffer->size = pCreateInfo->size;
+   buffer->usage = pCreateInfo->usage;
+   /* No backing store yet; bo/offset are filled in at bind-memory time. */
+   buffer->bo = NULL;
+   buffer->offset = 0;
+
+   *pBuffer = anv_buffer_to_handle(buffer);
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroyBuffer(
+    VkDevice                                    _device,
+    VkBuffer                                    _buffer,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+   /* The buffer does not own its BO (bound memory is freed separately). */
+   anv_free2(&device->alloc, pAllocator, buffer);
+}
+
+/* Fill a RENDER_SURFACE_STATE for a buffer view, dispatching to the
+ * generation-specific implementation based on the device's GPU gen.
+ */
+void
+anv_fill_buffer_surface_state(struct anv_device *device, void *state,
+                              enum isl_format format,
+                              uint32_t offset, uint32_t range, uint32_t stride)
+{
+   switch (device->info.gen) {
+   case 7:
+      /* Haswell (gen7.5) uses a different surface-state layout than IVB. */
+      if (device->info.is_haswell)
+         gen75_fill_buffer_surface_state(state, format, offset, range, stride);
+      else
+         gen7_fill_buffer_surface_state(state, format, offset, range, stride);
+      break;
+   case 8:
+      gen8_fill_buffer_surface_state(state, format, offset, range, stride);
+      break;
+   case 9:
+      gen9_fill_buffer_surface_state(state, format, offset, range, stride);
+      break;
+   default:
+      unreachable("unsupported gen\n");
+   }
+}
+
+void anv_DestroySampler(
+    VkDevice                                    _device,
+    VkSampler                                   _sampler,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_sampler, sampler, _sampler);
+
+   anv_free2(&device->alloc, pAllocator, sampler);
+}
+
+VkResult anv_CreateFramebuffer(
+    VkDevice                                    _device,
+    const VkFramebufferCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkFramebuffer*                              pFramebuffer)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_framebuffer *framebuffer;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO);
+
+   /* The attachment image-view pointers are stored in a trailing array
+    * allocated in the same block as the framebuffer.
+    */
+   size_t size = sizeof(*framebuffer) +
+                 sizeof(struct anv_image_view *) * pCreateInfo->attachmentCount;
+   framebuffer = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (framebuffer == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   framebuffer->attachment_count = pCreateInfo->attachmentCount;
+   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+      VkImageView _iview = pCreateInfo->pAttachments[i];
+      framebuffer->attachments[i] = anv_image_view_from_handle(_iview);
+   }
+
+   framebuffer->width = pCreateInfo->width;
+   framebuffer->height = pCreateInfo->height;
+   framebuffer->layers = pCreateInfo->layers;
+
+   *pFramebuffer = anv_framebuffer_to_handle(framebuffer);
+
+   return VK_SUCCESS;
+}
+
+void anv_DestroyFramebuffer(
+    VkDevice                                    _device,
+    VkFramebuffer                               _fb,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_framebuffer, fb, _fb);
+
+   /* The attachments array is part of the same allocation (see
+    * anv_CreateFramebuffer), so a single free releases everything.
+    */
+   anv_free2(&device->alloc, pAllocator, fb);
+}
+
+/* Debug-marker entry points: exported directly with default visibility and
+ * implemented as no-op stubs.
+ */
+void vkCmdDbgMarkerBegin(
+    VkCommandBuffer                              commandBuffer,
+    const char*                                 pMarker)
+   __attribute__ ((visibility ("default")));
+
+void vkCmdDbgMarkerEnd(
+   VkCommandBuffer                              commandBuffer)
+   __attribute__ ((visibility ("default")));
+
+void vkCmdDbgMarkerBegin(
+    VkCommandBuffer                              commandBuffer,
+    const char*                                 pMarker)
+{
+}
+
+void vkCmdDbgMarkerEnd(
+    VkCommandBuffer                              commandBuffer)
+{
+}
diff --git a/src/vulkan/anv_dump.c b/src/vulkan/anv_dump.c
new file mode 100644 (file)
index 0000000..b7fa28b
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+/* This file contains utility functions for help debugging.  They can be
+ * called from GDB or similar to help inspect images and buffers.
+ */
+
+/* Debug helper (intended to be callable from GDB): blit one mip/layer of
+ * 'image' into a freshly created linear RGBA8 image, wait for the GPU, then
+ * write the pixels out as a binary PPM file at 'filename'.  Alpha is dropped.
+ */
+void
+anv_dump_image_to_ppm(struct anv_device *device,
+                      struct anv_image *image, unsigned miplevel,
+                      unsigned array_layer, const char *filename)
+{
+   VkDevice vk_device = anv_device_to_handle(device);
+   VkResult result;
+
+   /* Compute the extent of the requested mip level by halving per level. */
+   VkExtent2D extent = { image->extent.width, image->extent.height };
+   for (unsigned i = 0; i < miplevel; i++) {
+      extent.width = MAX2(1, extent.width / 2);
+      extent.height = MAX2(1, extent.height / 2);
+   }
+
+   /* Create a linear-tiled RGBA8 destination we can map and read back. */
+   VkImage copy_image;
+   result = anv_CreateImage(vk_device,
+      &(VkImageCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+         .imageType = VK_IMAGE_TYPE_2D,
+         .format = VK_FORMAT_R8G8B8A8_UNORM,
+         .extent = (VkExtent3D) { extent.width, extent.height, 1 },
+         .mipLevels = 1,
+         .arrayLayers = 1,
+         .samples = 1,
+         .tiling = VK_IMAGE_TILING_LINEAR,
+         .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+         .flags = 0,
+      }, NULL, &copy_image);
+   assert(result == VK_SUCCESS);
+
+   VkMemoryRequirements reqs;
+   anv_GetImageMemoryRequirements(vk_device, copy_image, &reqs);
+
+   /* NOTE(review): memoryTypeIndex is hard-coded to 0 — acceptable for a
+    * debug-only helper, but confirm type 0 is host-mappable on all gens.
+    */
+   VkDeviceMemory memory;
+   result = anv_AllocateMemory(vk_device,
+      &(VkMemoryAllocateInfo) {
+         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+         .allocationSize = reqs.size,
+         .memoryTypeIndex = 0,
+      }, NULL, &memory);
+   assert(result == VK_SUCCESS);
+
+   result = anv_BindImageMemory(vk_device, copy_image, memory, 0);
+   assert(result == VK_SUCCESS);
+
+   /* Record a one-shot command buffer that blits into the copy image. */
+   VkCommandPool commandPool;
+   result = anv_CreateCommandPool(vk_device,
+      &(VkCommandPoolCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+         .queueFamilyIndex = 0,
+         .flags = 0,
+      }, NULL, &commandPool);
+   assert(result == VK_SUCCESS);
+
+   VkCommandBuffer cmd;
+   result = anv_AllocateCommandBuffers(vk_device,
+      &(VkCommandBufferAllocateInfo) {
+         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+         .commandPool = commandPool,
+         .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+         .commandBufferCount = 1,
+      }, &cmd);
+   assert(result == VK_SUCCESS);
+
+   result = anv_BeginCommandBuffer(cmd,
+      &(VkCommandBufferBeginInfo) {
+         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+         .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+      });
+   assert(result == VK_SUCCESS);
+
+   anv_CmdBlitImage(cmd,
+      anv_image_to_handle(image), VK_IMAGE_LAYOUT_GENERAL,
+      copy_image, VK_IMAGE_LAYOUT_GENERAL, 1,
+      &(VkImageBlit) {
+         .srcSubresource = {
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .mipLevel = miplevel,
+            .baseArrayLayer = array_layer,
+            .layerCount = 1,
+         },
+         .srcOffsets = {
+            { 0, 0, 0 },
+            { extent.width, extent.height, 1 },
+         },
+         .dstSubresource = {
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .mipLevel = 0,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+         },
+         .dstOffsets = {
+            { 0, 0, 0 },
+            { extent.width, extent.height, 1 },
+         },
+      }, VK_FILTER_NEAREST);
+
+   /* Barrier so the blit result is visible to the host read below.
+    * NOTE(review): the access masks look inverted — for transfer-write ->
+    * host-read, srcAccessMask should be TRANSFER_WRITE and dstAccessMask
+    * HOST_READ.  Also 'true' is being passed where VkDependencyFlags is
+    * expected — confirm both.
+    */
+   ANV_CALL(CmdPipelineBarrier)(cmd,
+      VK_PIPELINE_STAGE_TRANSFER_BIT,
+      VK_PIPELINE_STAGE_TRANSFER_BIT,
+      true, 0, NULL, 0, NULL, 1,
+      &(VkImageMemoryBarrier) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+         .srcAccessMask = VK_ACCESS_HOST_READ_BIT,
+         .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+         .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
+         .newLayout = VK_IMAGE_LAYOUT_GENERAL,
+         .srcQueueFamilyIndex = 0,
+         .dstQueueFamilyIndex = 0,
+         .image = copy_image,
+         .subresourceRange = (VkImageSubresourceRange) {
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+         },
+      });
+
+   result = anv_EndCommandBuffer(cmd);
+   assert(result == VK_SUCCESS);
+
+   /* Submit and block until the blit has completed. */
+   VkFence fence;
+   result = anv_CreateFence(vk_device,
+      &(VkFenceCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+         .flags = 0,
+      }, NULL, &fence);
+   assert(result == VK_SUCCESS);
+
+   result = anv_QueueSubmit(anv_queue_to_handle(&device->queue), 1,
+      &(VkSubmitInfo) {
+         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+         .commandBufferCount = 1,
+         .pCommandBuffers = &cmd,
+      }, fence);
+   assert(result == VK_SUCCESS);
+
+   result = anv_WaitForFences(vk_device, 1, &fence, true, UINT64_MAX);
+   assert(result == VK_SUCCESS);
+
+   anv_DestroyFence(vk_device, fence, NULL);
+   anv_DestroyCommandPool(vk_device, commandPool, NULL);
+
+   /* Map the linear copy and locate the pixel data within it. */
+   uint8_t *map;
+   result = anv_MapMemory(vk_device, memory, 0, reqs.size, 0, (void **)&map);
+   assert(result == VK_SUCCESS);
+
+   VkSubresourceLayout layout;
+   anv_GetImageSubresourceLayout(vk_device, copy_image,
+      &(VkImageSubresource) {
+         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+         .mipLevel = 0,
+         .arrayLayer = 0,
+      }, &layout);
+
+   map += layout.offset;
+
+   /* Now we can finally write the PPM file */
+   FILE *file = fopen(filename, "wb");
+   assert(file);
+
+   fprintf(file, "P6\n%d %d\n255\n", extent.width, extent.height);
+   for (unsigned y = 0; y < extent.height; y++) {
+      /* Repack RGBA to RGB one row at a time (PPM has no alpha). */
+      uint8_t row[extent.width * 3];
+      for (unsigned x = 0; x < extent.width; x++) {
+         row[x * 3 + 0] = map[x * 4 + 0];
+         row[x * 3 + 1] = map[x * 4 + 1];
+         row[x * 3 + 2] = map[x * 4 + 2];
+      }
+      fwrite(row, 3, extent.width, file);
+
+      /* Advance by the image's row pitch, which may exceed width * 4. */
+      map += layout.rowPitch;
+   }
+   fclose(file);
+
+   anv_UnmapMemory(vk_device, memory);
+   anv_DestroyImage(vk_device, copy_image, NULL);
+   anv_FreeMemory(vk_device, memory, NULL);
+}
diff --git a/src/vulkan/anv_entrypoints_gen.py b/src/vulkan/anv_entrypoints_gen.py
new file mode 100644 (file)
index 0000000..1e4cfcb
--- /dev/null
@@ -0,0 +1,324 @@
+# coding=utf-8
+#
+# Copyright © 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+
+import fileinput, re, sys
+
+# Each function typedef in the vulkan.h header is all on one line and matches
+# this regex. We hope that won't change.
+
+p = re.compile('typedef ([^ ]*) *\((?:VKAPI_PTR)? *\*PFN_vk([^(]*)\)(.*);')
+
+# List of (return_type, name, args, index, hash) tuples, one per entry point.
+entrypoints = []
+
+# We generate a static hash table for entry point lookup
+# (vkGetProcAddress). We use a linear congruential generator for our hash
+# function and a power-of-two size table. The prime numbers are determined
+# experimentally.
+
+none = 0xffff
+hash_size = 256
+u32_mask = 2**32 - 1
+hash_mask = hash_size - 1
+
+prime_factor = 5024183
+prime_step = 19
+
+def hash(name):
+    # 32-bit LCG-style string hash; must match the C version emitted into
+    # anv_lookup_entrypoint() below.
+    h = 0;
+    for c in name:
+        h = (h * prime_factor + ord(c)) & u32_mask
+
+    return h
+
+opt_header = False
+opt_code = False
+
+# Pop the mode argument so fileinput falls back to reading vulkan.h from
+# stdin (with two argv entries, pop() removes the mode string itself).
+if (sys.argv[1] == "header"):
+    opt_header = True
+    sys.argv.pop()
+elif (sys.argv[1] == "code"):
+    opt_code = True
+    sys.argv.pop()
+
+# Parse the entry points in the header
+
+i = 0
+for line in fileinput.input():
+    m  = p.match(line)
+    if (m):
+        # PFN_vkVoidFunction is a helper typedef, not a real entry point.
+        if m.group(2) == 'VoidFunction':
+            continue
+        fullname = "vk" + m.group(2)
+        h = hash(fullname)
+        entrypoints.append((m.group(1), m.group(2), m.group(3), i, h))
+        i = i + 1
+
+# For outputting entrypoints.h we generate a anv_EntryPoint() prototype
+# per entry point.
+
+if opt_header:
+    print "/* This file generated from vk_gen.py, don't edit directly. */\n"
+
+    # The dispatch table overlays a void-pointer array on the named function
+    # pointers so entries can be indexed either way.
+    print "struct anv_dispatch_table {"
+    print "   union {"
+    print "      void *entrypoints[%d];" % len(entrypoints)
+    print "      struct {"
+
+    for type, name, args, num, h in entrypoints:
+        print "         %s (*%s)%s;" % (type, name, args)
+    print "      };\n"
+    print "   };\n"
+    print "};\n"
+
+    print "void anv_set_dispatch_devinfo(const struct brw_device_info *info);\n"
+
+    # Prototype every per-gen and validate variant of each entry point.
+    for type, name, args, num, h in entrypoints:
+        print "%s anv_%s%s;" % (type, name, args)
+        print "%s gen7_%s%s;" % (type, name, args)
+        print "%s gen75_%s%s;" % (type, name, args)
+        print "%s gen8_%s%s;" % (type, name, args)
+        print "%s gen9_%s%s;" % (type, name, args)
+        print "%s anv_validate_%s%s;" % (type, name, args)
+    exit()
+
+
+
+print """/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* DO NOT EDIT! This is a generated file. */
+
+#include "anv_private.h"
+
+struct anv_entrypoint {
+   uint32_t name;
+   uint32_t hash;
+};
+
+/* We use a big string constant to avoid lots of reloctions from the entry
+ * point table to lots of little strings. The entries in the entry point table
+ * store the index into this big string.
+ */
+
+static const char strings[] ="""
+
+# Emit each entry-point name into the big string constant, recording its
+# byte offset.  The "2 +" accounts for the "vk" prefix, "+ 1" for the NUL.
+offsets = []
+i = 0;
+for type, name, args, num, h in entrypoints:
+    print "   \"vk%s\\0\"" % name
+    offsets.append(i)
+    i += 2 + len(name) + 1
+print """   ;
+
+/* Weak aliases for all potential validate functions. These will resolve to
+ * NULL if they're not defined, which lets the resolve_entrypoint() function
+ * either pick a validate wrapper if available or just plug in the actual
+ * entry point.
+ */
+"""
+
+# Now generate the table of all entry points and their validation functions
+
+print "\nstatic const struct anv_entrypoint entrypoints[] = {"
+for type, name, args, num, h in entrypoints:
+    print "   { %5d, 0x%08x }," % (offsets[num], h)
+print "};\n"
+
+# Emit weak declarations plus a per-layer dispatch table for each variant
+# (unimplemented entries resolve to NULL at link time).
+for layer in [ "anv", "validate", "gen7", "gen75", "gen8", "gen9" ]:
+    for type, name, args, num, h in entrypoints:
+        print "%s %s_%s%s __attribute__ ((weak));" % (type, layer, name, args)
+    print "\nconst struct anv_dispatch_table %s_layer = {" % layer
+    for type, name, args, num, h in entrypoints:
+        print "   .%s = %s_%s," % (name, layer, name)
+    print "};\n"
+
+# Emit the runtime resolver: a constructor that reads ANV_VALIDATE, the
+# devinfo hook, and anv_resolve_entrypoint() which walks gen9 -> gen8 ->
+# gen75/gen7 -> anv falling through to the first implemented variant.
+print """
+#ifdef DEBUG
+static bool enable_validate = true;
+#else
+static bool enable_validate = false;
+#endif
+
+/* We can't use symbols that need resolving (like, oh, getenv) in the resolve
+ * function. This means that we have to determine whether or not to use the
+ * validation layer sometime before that. The constructor function attribute asks
+ * the dynamic linker to invoke determine_validate() at dlopen() time which
+ * works.
+ */
+static void __attribute__ ((constructor))
+determine_validate(void)
+{
+   const char *s = getenv("ANV_VALIDATE");
+
+   if (s)
+      enable_validate = atoi(s);
+}
+
+static const struct brw_device_info *dispatch_devinfo;
+
+void
+anv_set_dispatch_devinfo(const struct brw_device_info *devinfo)
+{
+   dispatch_devinfo = devinfo;
+}
+
+void * __attribute__ ((noinline))
+anv_resolve_entrypoint(uint32_t index)
+{
+   if (enable_validate && validate_layer.entrypoints[index])
+      return validate_layer.entrypoints[index];
+
+   if (dispatch_devinfo == NULL) {
+      assert(anv_layer.entrypoints[index]);
+      return anv_layer.entrypoints[index];
+   }
+
+   switch (dispatch_devinfo->gen) {
+   case 9:
+      if (gen9_layer.entrypoints[index])
+         return gen9_layer.entrypoints[index];
+      /* fall through */
+   case 8:
+      if (gen8_layer.entrypoints[index])
+         return gen8_layer.entrypoints[index];
+      /* fall through */
+   case 7:
+      if (dispatch_devinfo->is_haswell && gen75_layer.entrypoints[index])
+         return gen75_layer.entrypoints[index];
+
+      if (gen7_layer.entrypoints[index])
+         return gen7_layer.entrypoints[index];
+      /* fall through */
+   case 0:
+      return anv_layer.entrypoints[index];
+   default:
+      unreachable("unsupported gen\\n");
+   }
+}
+"""
+
+# Now output ifuncs and their resolve helpers for all entry points. The
+# resolve helper calls resolve_entrypoint() with the entry point index, which
+# lets the resolver look it up in the table.
+
+for type, name, args, num, h in entrypoints:
+    print "static void *resolve_%s(void) { return anv_resolve_entrypoint(%d); }" % (name, num)
+    print "%s vk%s%s\n   __attribute__ ((ifunc (\"resolve_%s\"), visibility (\"default\")));\n" % (type, name, args, name)
+
+
+# Now generate the hash table used for entry point look up.  This is a
+# uint16_t table of entry point indices. We use 0xffff to indicate an entry
+# in the hash table is empty.
+
+map = [none for f in xrange(hash_size)]
+collisions = [0 for f in xrange(10)]
+for type, name, args, num, h in entrypoints:
+    # Linear probing with a fixed prime step; the C lookup function below
+    # must probe in exactly the same order.
+    level = 0
+    while map[h & hash_mask] != none:
+        h = h + prime_step
+        level = level + 1
+    # Histogram probe lengths; bucket 9 collects 9-or-more.
+    if level > 9:
+        collisions[9] += 1
+    else:
+        collisions[level] += 1
+    map[h & hash_mask] = num
+
+print "/* Hash table stats:"
+print " * size %d entries" % hash_size
+print " * collisions  entries"
+for i in xrange(10):
+    if (i == 9):
+        plus = "+"
+    else:
+        plus = " "
+
+    print " *     %2d%s     %4d" % (i, plus, collisions[i])
+print " */\n"
+
+print "#define none 0x%04x\n" % none
+
+# Emit the table itself, eight entries per line.
+print "static const uint16_t map[] = {"
+for i in xrange(0, hash_size, 8):
+    print "   ",
+    for j in xrange(i, i + 8):
+        if map[j] & 0xffff == 0xffff:
+            print "  none,",
+        else:
+            print "0x%04x," % (map[j] & 0xffff),
+    print
+
+print "};"    
+
+# Finally we generate the hash table lookup function.  The hash function and
+# linear probing algorithm matches the hash table generated above.
+
+# The %d placeholders are filled with the same prime_factor/prime_step/
+# hash_mask constants used to build the table, keeping C and Python in sync.
+print """
+void *
+anv_lookup_entrypoint(const char *name)
+{
+   static const uint32_t prime_factor = %d;
+   static const uint32_t prime_step = %d;
+   const struct anv_entrypoint *e;
+   uint32_t hash, h, i;
+   const char *p;
+
+   hash = 0;
+   for (p = name; *p; p++)
+      hash = hash * prime_factor + *p;
+
+   h = hash;
+   do {
+      i = map[h & %d];
+      if (i == none)
+         return NULL;
+      e = &entrypoints[i];
+      h += prime_step;
+   } while (e->hash != hash);
+
+   if (strcmp(name, strings + e->name) != 0)
+      return NULL;
+
+   return anv_resolve_entrypoint(i);
+}
+""" % (prime_factor, prime_step, hash_mask)
diff --git a/src/vulkan/anv_formats.c b/src/vulkan/anv_formats.c
new file mode 100644 (file)
index 0000000..d480ee7
--- /dev/null
@@ -0,0 +1,568 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "brw_surface_formats.h"
+
+#include "gen7_pack.h"
+
+#define fmt(__vk_fmt, __hw_fmt, ...) \
+   [__vk_fmt] = { \
+      .vk_format = __vk_fmt, \
+      .name = #__vk_fmt, \
+      .surface_format = __hw_fmt, \
+      .isl_layout = &isl_format_layouts[__hw_fmt], \
+      __VA_ARGS__ \
+   }
+
+static const struct anv_format anv_formats[] = {
+   fmt(VK_FORMAT_UNDEFINED,               ISL_FORMAT_RAW),
+   fmt(VK_FORMAT_R4G4_UNORM_PACK8,        ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_R4G4B4A4_UNORM_PACK16,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B4G4R4A4_UNORM_PACK16,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_R5G6B5_UNORM_PACK16,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B5G6R5_UNORM_PACK16,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_R5G5B5A1_UNORM_PACK16,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B5G5R5A1_UNORM_PACK16,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_A1R5G5B5_UNORM_PACK16,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_R8_UNORM,                ISL_FORMAT_R8_UNORM),
+   fmt(VK_FORMAT_R8_SNORM,                ISL_FORMAT_R8_SNORM),
+   fmt(VK_FORMAT_R8_USCALED,              ISL_FORMAT_R8_USCALED),
+   fmt(VK_FORMAT_R8_SSCALED,              ISL_FORMAT_R8_SSCALED),
+   fmt(VK_FORMAT_R8_UINT,                 ISL_FORMAT_R8_UINT),
+   fmt(VK_FORMAT_R8_SINT,                 ISL_FORMAT_R8_SINT),
+   fmt(VK_FORMAT_R8_SRGB,                 ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_R8G8_UNORM,              ISL_FORMAT_R8G8_UNORM),
+   fmt(VK_FORMAT_R8G8_SNORM,              ISL_FORMAT_R8G8_SNORM),
+   fmt(VK_FORMAT_R8G8_USCALED,            ISL_FORMAT_R8G8_USCALED),
+   fmt(VK_FORMAT_R8G8_SSCALED,            ISL_FORMAT_R8G8_SSCALED),
+   fmt(VK_FORMAT_R8G8_UINT,               ISL_FORMAT_R8G8_UINT),
+   fmt(VK_FORMAT_R8G8_SINT,               ISL_FORMAT_R8G8_SINT),
+   fmt(VK_FORMAT_R8G8_SRGB,               ISL_FORMAT_UNSUPPORTED), /* L8A8_UNORM_SRGB */
+   fmt(VK_FORMAT_R8G8B8_UNORM,            ISL_FORMAT_R8G8B8_UNORM),
+   fmt(VK_FORMAT_R8G8B8_SNORM,            ISL_FORMAT_R8G8B8_SNORM),
+   fmt(VK_FORMAT_R8G8B8_USCALED,          ISL_FORMAT_R8G8B8_USCALED),
+   fmt(VK_FORMAT_R8G8B8_SSCALED,          ISL_FORMAT_R8G8B8_SSCALED),
+   fmt(VK_FORMAT_R8G8B8_UINT,             ISL_FORMAT_R8G8B8_UINT),
+   fmt(VK_FORMAT_R8G8B8_SINT,             ISL_FORMAT_R8G8B8_SINT),
+   fmt(VK_FORMAT_R8G8B8_SRGB,             ISL_FORMAT_UNSUPPORTED), /* B8G8R8A8_UNORM_SRGB */
+   fmt(VK_FORMAT_R8G8B8A8_UNORM,          ISL_FORMAT_R8G8B8A8_UNORM),
+   fmt(VK_FORMAT_R8G8B8A8_SNORM,          ISL_FORMAT_R8G8B8A8_SNORM),
+   fmt(VK_FORMAT_R8G8B8A8_USCALED,        ISL_FORMAT_R8G8B8A8_USCALED),
+   fmt(VK_FORMAT_R8G8B8A8_SSCALED,        ISL_FORMAT_R8G8B8A8_SSCALED),
+   fmt(VK_FORMAT_R8G8B8A8_UINT,           ISL_FORMAT_R8G8B8A8_UINT),
+   fmt(VK_FORMAT_R8G8B8A8_SINT,           ISL_FORMAT_R8G8B8A8_SINT),
+   fmt(VK_FORMAT_R8G8B8A8_SRGB,           ISL_FORMAT_R8G8B8A8_UNORM_SRGB),
+   fmt(VK_FORMAT_A8B8G8R8_UNORM_PACK32,   ISL_FORMAT_R8G8B8A8_UNORM),
+   fmt(VK_FORMAT_A8B8G8R8_SNORM_PACK32,   ISL_FORMAT_R8G8B8A8_SNORM),
+   fmt(VK_FORMAT_A8B8G8R8_USCALED_PACK32, ISL_FORMAT_R8G8B8A8_USCALED),
+   fmt(VK_FORMAT_A8B8G8R8_SSCALED_PACK32, ISL_FORMAT_R8G8B8A8_SSCALED),
+   fmt(VK_FORMAT_A8B8G8R8_UINT_PACK32,    ISL_FORMAT_R8G8B8A8_UINT),
+   fmt(VK_FORMAT_A8B8G8R8_SINT_PACK32,    ISL_FORMAT_R8G8B8A8_SINT),
+   fmt(VK_FORMAT_A8B8G8R8_SRGB_PACK32,    ISL_FORMAT_R8G8B8A8_UNORM_SRGB),
+   fmt(VK_FORMAT_A2R10G10B10_UNORM_PACK32, ISL_FORMAT_B10G10R10A2_UNORM),
+   fmt(VK_FORMAT_A2R10G10B10_SNORM_PACK32, ISL_FORMAT_B10G10R10A2_SNORM),
+   fmt(VK_FORMAT_A2R10G10B10_USCALED_PACK32, ISL_FORMAT_B10G10R10A2_USCALED),
+   fmt(VK_FORMAT_A2R10G10B10_SSCALED_PACK32, ISL_FORMAT_B10G10R10A2_SSCALED),
+   fmt(VK_FORMAT_A2R10G10B10_UINT_PACK32, ISL_FORMAT_B10G10R10A2_UINT),
+   fmt(VK_FORMAT_A2R10G10B10_SINT_PACK32, ISL_FORMAT_B10G10R10A2_SINT),
+   fmt(VK_FORMAT_A2B10G10R10_UNORM_PACK32, ISL_FORMAT_R10G10B10A2_UNORM),
+   fmt(VK_FORMAT_A2B10G10R10_SNORM_PACK32, ISL_FORMAT_R10G10B10A2_SNORM),
+   fmt(VK_FORMAT_A2B10G10R10_USCALED_PACK32, ISL_FORMAT_R10G10B10A2_USCALED),
+   fmt(VK_FORMAT_A2B10G10R10_SSCALED_PACK32, ISL_FORMAT_R10G10B10A2_SSCALED),
+   fmt(VK_FORMAT_A2B10G10R10_UINT_PACK32, ISL_FORMAT_R10G10B10A2_UINT),
+   fmt(VK_FORMAT_A2B10G10R10_SINT_PACK32, ISL_FORMAT_R10G10B10A2_SINT),
+   fmt(VK_FORMAT_R16_UNORM,               ISL_FORMAT_R16_UNORM),
+   fmt(VK_FORMAT_R16_SNORM,               ISL_FORMAT_R16_SNORM),
+   fmt(VK_FORMAT_R16_USCALED,             ISL_FORMAT_R16_USCALED),
+   fmt(VK_FORMAT_R16_SSCALED,             ISL_FORMAT_R16_SSCALED),
+   fmt(VK_FORMAT_R16_UINT,                ISL_FORMAT_R16_UINT),
+   fmt(VK_FORMAT_R16_SINT,                ISL_FORMAT_R16_SINT),
+   fmt(VK_FORMAT_R16_SFLOAT,              ISL_FORMAT_R16_FLOAT),
+   fmt(VK_FORMAT_R16G16_UNORM,            ISL_FORMAT_R16G16_UNORM),
+   fmt(VK_FORMAT_R16G16_SNORM,            ISL_FORMAT_R16G16_SNORM),
+   fmt(VK_FORMAT_R16G16_USCALED,          ISL_FORMAT_R16G16_USCALED),
+   fmt(VK_FORMAT_R16G16_SSCALED,          ISL_FORMAT_R16G16_SSCALED),
+   fmt(VK_FORMAT_R16G16_UINT,             ISL_FORMAT_R16G16_UINT),
+   fmt(VK_FORMAT_R16G16_SINT,             ISL_FORMAT_R16G16_SINT),
+   fmt(VK_FORMAT_R16G16_SFLOAT,           ISL_FORMAT_R16G16_FLOAT),
+   fmt(VK_FORMAT_R16G16B16_UNORM,         ISL_FORMAT_R16G16B16_UNORM),
+   fmt(VK_FORMAT_R16G16B16_SNORM,         ISL_FORMAT_R16G16B16_SNORM),
+   fmt(VK_FORMAT_R16G16B16_USCALED,       ISL_FORMAT_R16G16B16_USCALED),
+   fmt(VK_FORMAT_R16G16B16_SSCALED,       ISL_FORMAT_R16G16B16_SSCALED),
+   fmt(VK_FORMAT_R16G16B16_UINT,          ISL_FORMAT_R16G16B16_UINT),
+   fmt(VK_FORMAT_R16G16B16_SINT,          ISL_FORMAT_R16G16B16_SINT),
+   fmt(VK_FORMAT_R16G16B16_SFLOAT,        ISL_FORMAT_R16G16B16_FLOAT),
+   fmt(VK_FORMAT_R16G16B16A16_UNORM,      ISL_FORMAT_R16G16B16A16_UNORM),
+   fmt(VK_FORMAT_R16G16B16A16_SNORM,      ISL_FORMAT_R16G16B16A16_SNORM),
+   fmt(VK_FORMAT_R16G16B16A16_USCALED,    ISL_FORMAT_R16G16B16A16_USCALED),
+   fmt(VK_FORMAT_R16G16B16A16_SSCALED,    ISL_FORMAT_R16G16B16A16_SSCALED),
+   fmt(VK_FORMAT_R16G16B16A16_UINT,       ISL_FORMAT_R16G16B16A16_UINT),
+   fmt(VK_FORMAT_R16G16B16A16_SINT,       ISL_FORMAT_R16G16B16A16_SINT),
+   fmt(VK_FORMAT_R16G16B16A16_SFLOAT,     ISL_FORMAT_R16G16B16A16_FLOAT),
+   fmt(VK_FORMAT_R32_UINT,                ISL_FORMAT_R32_UINT,),
+   fmt(VK_FORMAT_R32_SINT,                ISL_FORMAT_R32_SINT,),
+   fmt(VK_FORMAT_R32_SFLOAT,              ISL_FORMAT_R32_FLOAT,),
+   fmt(VK_FORMAT_R32G32_UINT,             ISL_FORMAT_R32G32_UINT,),
+   fmt(VK_FORMAT_R32G32_SINT,             ISL_FORMAT_R32G32_SINT,),
+   fmt(VK_FORMAT_R32G32_SFLOAT,           ISL_FORMAT_R32G32_FLOAT,),
+   fmt(VK_FORMAT_R32G32B32_UINT,          ISL_FORMAT_R32G32B32_UINT,),
+   fmt(VK_FORMAT_R32G32B32_SINT,          ISL_FORMAT_R32G32B32_SINT,),
+   fmt(VK_FORMAT_R32G32B32_SFLOAT,        ISL_FORMAT_R32G32B32_FLOAT,),
+   fmt(VK_FORMAT_R32G32B32A32_UINT,       ISL_FORMAT_R32G32B32A32_UINT,),
+   fmt(VK_FORMAT_R32G32B32A32_SINT,       ISL_FORMAT_R32G32B32A32_SINT,),
+   fmt(VK_FORMAT_R32G32B32A32_SFLOAT,     ISL_FORMAT_R32G32B32A32_FLOAT,),
+   fmt(VK_FORMAT_R64_UINT,                ISL_FORMAT_R64_PASSTHRU),
+   fmt(VK_FORMAT_R64_SINT,                ISL_FORMAT_R64_PASSTHRU),
+   fmt(VK_FORMAT_R64_SFLOAT,              ISL_FORMAT_R64_FLOAT),
+   fmt(VK_FORMAT_R64G64_UINT,             ISL_FORMAT_R64G64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64_SINT,             ISL_FORMAT_R64G64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64_SFLOAT,           ISL_FORMAT_R64G64_FLOAT),
+   fmt(VK_FORMAT_R64G64B64_UINT,          ISL_FORMAT_R64G64B64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64B64_SINT,          ISL_FORMAT_R64G64B64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64B64_SFLOAT,        ISL_FORMAT_R64G64B64_FLOAT),
+   fmt(VK_FORMAT_R64G64B64A64_UINT,       ISL_FORMAT_R64G64B64A64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64B64A64_SINT,       ISL_FORMAT_R64G64B64A64_PASSTHRU),
+   fmt(VK_FORMAT_R64G64B64A64_SFLOAT,     ISL_FORMAT_R64G64B64A64_FLOAT),
+   fmt(VK_FORMAT_B10G11R11_UFLOAT_PACK32, ISL_FORMAT_R11G11B10_FLOAT),
+   fmt(VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,  ISL_FORMAT_R9G9B9E5_SHAREDEXP),
+
+   fmt(VK_FORMAT_D16_UNORM,               ISL_FORMAT_R16_UNORM,               .depth_format = D16_UNORM),
+   fmt(VK_FORMAT_X8_D24_UNORM_PACK32,     ISL_FORMAT_R24_UNORM_X8_TYPELESS,   .depth_format = D24_UNORM_X8_UINT),
+   fmt(VK_FORMAT_D32_SFLOAT,              ISL_FORMAT_R32_FLOAT,               .depth_format = D32_FLOAT),
+   fmt(VK_FORMAT_S8_UINT,                 ISL_FORMAT_R8_UINT,                                                       .has_stencil = true),
+   fmt(VK_FORMAT_D16_UNORM_S8_UINT,       ISL_FORMAT_R16_UNORM,               .depth_format = D16_UNORM,            .has_stencil = true),
+   fmt(VK_FORMAT_D24_UNORM_S8_UINT,       ISL_FORMAT_R24_UNORM_X8_TYPELESS,   .depth_format = D24_UNORM_X8_UINT,    .has_stencil = true),
+   fmt(VK_FORMAT_D32_SFLOAT_S8_UINT,      ISL_FORMAT_R32_FLOAT,               .depth_format = D32_FLOAT,            .has_stencil = true),
+
+   fmt(VK_FORMAT_BC1_RGB_UNORM_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC1_RGB_SRGB_BLOCK,      ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC1_RGBA_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC1_RGBA_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC2_UNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC2_SRGB_BLOCK,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC3_UNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC3_SRGB_BLOCK,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC4_UNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC4_SNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC5_UNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC5_SNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC6H_UFLOAT_BLOCK,       ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC6H_SFLOAT_BLOCK,       ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC7_UNORM_BLOCK,         ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_BC7_SRGB_BLOCK,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8),
+   fmt(VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK,  ISL_FORMAT_ETC2_SRGB8),
+   fmt(VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8_PTA),
+   fmt(VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, ISL_FORMAT_ETC2_SRGB8_PTA),
+   fmt(VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, ISL_FORMAT_ETC2_EAC_RGBA8),
+   fmt(VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, ISL_FORMAT_ETC2_EAC_SRGB8_A8),
+   fmt(VK_FORMAT_EAC_R11_UNORM_BLOCK,     ISL_FORMAT_EAC_R11),
+   fmt(VK_FORMAT_EAC_R11_SNORM_BLOCK,     ISL_FORMAT_EAC_SIGNED_R11),
+   fmt(VK_FORMAT_EAC_R11G11_UNORM_BLOCK,  ISL_FORMAT_EAC_RG11),
+   fmt(VK_FORMAT_EAC_R11G11_SNORM_BLOCK,  ISL_FORMAT_EAC_SIGNED_RG11),
+   fmt(VK_FORMAT_ASTC_4x4_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_4x4_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_5x4_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_5x4_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_5x5_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_5x5_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_6x5_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_6x5_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_6x6_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_6x6_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x5_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x5_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x6_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x6_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x8_UNORM_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_8x8_SRGB_BLOCK,     ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x5_UNORM_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x5_SRGB_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x6_UNORM_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x6_SRGB_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x8_UNORM_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x8_SRGB_BLOCK,    ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x10_UNORM_BLOCK,  ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_10x10_SRGB_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_12x10_UNORM_BLOCK,  ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_12x10_SRGB_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_12x12_UNORM_BLOCK,  ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_ASTC_12x12_SRGB_BLOCK,   ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_UNORM,            ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_SNORM,            ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_USCALED,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_SSCALED,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_UINT,             ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_SINT,             ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8_SRGB,             ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_UNORM,          ISL_FORMAT_B8G8R8A8_UNORM),
+   fmt(VK_FORMAT_B8G8R8A8_SNORM,          ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_USCALED,        ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_SSCALED,        ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_UINT,           ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_SINT,           ISL_FORMAT_UNSUPPORTED),
+   fmt(VK_FORMAT_B8G8R8A8_SRGB,           ISL_FORMAT_B8G8R8A8_UNORM_SRGB),
+};
+
+#undef fmt
+
+const struct anv_format *
+anv_format_for_vk_format(VkFormat format)
+{
+   return &anv_formats[format];
+}
+
+/**
+ * Exactly one bit must be set in \a aspect.
+ */
+enum isl_format
+anv_get_isl_format(VkFormat format, VkImageAspectFlags aspect,
+                   VkImageTiling tiling)
+{
+   const struct anv_format *anv_fmt = &anv_formats[format];
+
+   switch (aspect) {
+   case VK_IMAGE_ASPECT_COLOR_BIT:
+      if (anv_fmt->surface_format == ISL_FORMAT_UNSUPPORTED) {
+         return ISL_FORMAT_UNSUPPORTED;
+      } else if (tiling == VK_IMAGE_TILING_OPTIMAL &&
+                 !util_is_power_of_two(anv_fmt->isl_layout->bs)) {
+         /* Tiled formats *must* be power-of-two because we need to upload
+          * them with the render pipeline.  For 3-channel formats, we fix
+          * this by switching them over to RGBX or RGBA formats under the
+          * hood.
+          */
+         enum isl_format rgbx = isl_format_rgb_to_rgbx(anv_fmt->surface_format);
+         if (rgbx != ISL_FORMAT_UNSUPPORTED)
+            return rgbx;
+         else
+            return isl_format_rgb_to_rgba(anv_fmt->surface_format);
+      } else {
+         return anv_fmt->surface_format;
+      }
+
+   case VK_IMAGE_ASPECT_DEPTH_BIT:
+   case (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT):
+      assert(anv_fmt->depth_format != 0);
+      return anv_fmt->surface_format;
+
+   case VK_IMAGE_ASPECT_STENCIL_BIT:
+      assert(anv_fmt->has_stencil);
+      return ISL_FORMAT_R8_UINT;
+
+   default:
+      unreachable("bad VkImageAspect");
+      return ISL_FORMAT_UNSUPPORTED;
+   }
+}
+
+// Format capabilities
+
+void anv_validate_GetPhysicalDeviceFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    _format,
+    VkFormatProperties*                         pFormatProperties)
+{
+   const struct anv_format *format = anv_format_for_vk_format(_format);
+   fprintf(stderr, "vkGetFormatProperties(%s)\n", format->name);
+   anv_GetPhysicalDeviceFormatProperties(physicalDevice, _format, pFormatProperties);
+}
+
+static VkFormatFeatureFlags
+get_image_format_properties(int gen, enum isl_format base,
+                            enum isl_format actual)
+{
+   const struct brw_surface_format_info *info = &surface_formats[actual];
+
+   if (actual == ISL_FORMAT_UNSUPPORTED || !info->exists)
+      return 0;
+
+   VkFormatFeatureFlags flags = 0;
+   if (info->sampling <= gen) {
+      flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
+               VK_FORMAT_FEATURE_BLIT_SRC_BIT;
+   }
+
+   if (info->render_target <= gen) {
+      flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+               VK_FORMAT_FEATURE_BLIT_DST_BIT;
+   }
+
+   if (info->alpha_blend <= gen)
+      flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+
+   /* Load/store is determined based on base format.  This prevents RGB
+    * formats from showing up as load/store capable.
+    */
+   if (isl_is_storage_image_format(base))
+      flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+
+   if (base == ISL_FORMAT_R32_SINT || base == ISL_FORMAT_R32_UINT)
+      flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
+
+   return flags;
+}
+
+static VkFormatFeatureFlags
+get_buffer_format_properties(int gen, enum isl_format format)
+{
+   const struct brw_surface_format_info *info = &surface_formats[format];
+
+   if (format == ISL_FORMAT_UNSUPPORTED || !info->exists)
+      return 0;
+
+   VkFormatFeatureFlags flags = 0;
+   if (info->sampling <= gen && !isl_format_is_compressed(format))
+      flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
+
+   if (info->input_vb <= gen)
+      flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
+
+   if (isl_is_storage_image_format(format))
+      flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+
+   if (format == ISL_FORMAT_R32_SINT || format == ISL_FORMAT_R32_UINT)
+      flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+
+   return flags;
+}
+
+static void
+anv_physical_device_get_format_properties(struct anv_physical_device *physical_device,
+                                          VkFormat format,
+                                          VkFormatProperties *out_properties)
+{
+   int gen = physical_device->info->gen * 10;
+   if (physical_device->info->is_haswell)
+      gen += 5;
+
+   VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0;
+   if (anv_format_is_depth_or_stencil(&anv_formats[format])) {
+      tiled |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT;
+      if (physical_device->info->gen >= 8) {
+         tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
+         tiled |= VK_FORMAT_FEATURE_BLIT_SRC_BIT;
+      }
+      if (anv_formats[format].depth_format) {
+         tiled |= VK_FORMAT_FEATURE_BLIT_DST_BIT;
+      }
+   } else {
+      enum isl_format linear_fmt, tiled_fmt;
+      linear_fmt = anv_get_isl_format(format, VK_IMAGE_ASPECT_COLOR_BIT,
+                                      VK_IMAGE_TILING_LINEAR);
+      tiled_fmt = anv_get_isl_format(format, VK_IMAGE_ASPECT_COLOR_BIT,
+                                     VK_IMAGE_TILING_OPTIMAL);
+
+      linear = get_image_format_properties(gen, linear_fmt, linear_fmt);
+      tiled = get_image_format_properties(gen, linear_fmt, tiled_fmt);
+      buffer = get_buffer_format_properties(gen, linear_fmt);
+
+      /* XXX: We handle 3-channel formats by switching them out for RGBX or
+       * RGBA formats behind-the-scenes.  This works fine for textures
+       * because the upload process will fill in the extra channel.
+       * We could also support it for render targets, but it will take
+       * substantially more work and we have enough RGBX formats to handle
+       * what most clients will want.
+       */
+      if (linear_fmt != ISL_FORMAT_UNSUPPORTED &&
+          isl_format_is_rgb(linear_fmt) &&
+          isl_format_rgb_to_rgbx(linear_fmt) == ISL_FORMAT_UNSUPPORTED) {
+         tiled &= ~VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT &
+                  ~VK_FORMAT_FEATURE_BLIT_DST_BIT;
+      }
+   }
+
+   out_properties->linearTilingFeatures = linear;
+   out_properties->optimalTilingFeatures = tiled;
+   out_properties->bufferFeatures = buffer;
+
+   return;
+}
+
+
+void anv_GetPhysicalDeviceFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkFormatProperties*                         pFormatProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+
+   anv_physical_device_get_format_properties(
+               physical_device,
+               format,
+               pFormatProperties);
+}
+
+VkResult anv_GetPhysicalDeviceImageFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkImageType                                 type,
+    VkImageTiling                               tiling,
+    VkImageUsageFlags                           usage,
+    VkImageCreateFlags                          flags,
+    VkImageFormatProperties*                    pImageFormatProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+   VkFormatProperties format_props;
+   VkFormatFeatureFlags format_feature_flags;
+   VkExtent3D maxExtent;
+   uint32_t maxMipLevels;
+   uint32_t maxArraySize;
+
+   anv_physical_device_get_format_properties(physical_device, format,
+                                             &format_props);
+
+   /* Extract the VkFormatFeatureFlags that are relevant for the queried
+    * tiling.
+    */
+   if (tiling == VK_IMAGE_TILING_LINEAR) {
+      format_feature_flags = format_props.linearTilingFeatures;
+   } else if (tiling == VK_IMAGE_TILING_OPTIMAL) {
+      format_feature_flags = format_props.optimalTilingFeatures;
+   } else {
+      unreachable("bad VkImageTiling");
+   }
+
+   switch (type) {
+   default:
+      unreachable("bad VkImageType");
+   case VK_IMAGE_TYPE_1D:
+      maxExtent.width = 16384;
+      maxExtent.height = 1;
+      maxExtent.depth = 1;
+      maxMipLevels = 15; /* log2(maxWidth) + 1 */
+      maxArraySize = 2048;
+      break;
+   case VK_IMAGE_TYPE_2D:
+      /* FINISHME: Does this really differ for cube maps? The documentation
+       * for RENDER_SURFACE_STATE suggests so.
+       */
+      maxExtent.width = 16384;
+      maxExtent.height = 16384;
+      maxExtent.depth = 1;
+      maxMipLevels = 15; /* log2(maxWidth) + 1 */
+      maxArraySize = 2048;
+      break;
+   case VK_IMAGE_TYPE_3D:
+      maxExtent.width = 2048;
+      maxExtent.height = 2048;
+      maxExtent.depth = 2048;
+      maxMipLevels = 12; /* log2(maxWidth) + 1 */
+      maxArraySize = 1;
+      break;
+   }
+
+   if (usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+      /* Meta implements transfers by sampling from the source image. */
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+      if (anv_format_for_vk_format(format)->has_stencil) {
+         /* Not yet implemented because copying to a W-tiled surface is crazy
+          * hard.
+          */
+         anv_finishme("support VK_IMAGE_USAGE_TRANSFER_DST_BIT for "
+                      "stencil format");
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+      if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+         goto unsupported;
+      }
+   }
+
+   if (usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) {
+      /* Nothing to check. */
+   }
+
+   if (usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+      /* Ignore this flag because it was removed from the
+       * provisional_I_20150910 header.
+       */
+   }
+
+   *pImageFormatProperties = (VkImageFormatProperties) {
+      .maxExtent = maxExtent,
+      .maxMipLevels = maxMipLevels,
+      .maxArrayLayers = maxArraySize,
+
+      /* FINISHME: Support multisampling */
+      .sampleCounts = VK_SAMPLE_COUNT_1_BIT,
+
+      /* FINISHME: Accurately calculate
+       * VkImageFormatProperties::maxResourceSize.
+       */
+      .maxResourceSize = UINT32_MAX,
+   };
+
+   return VK_SUCCESS;
+
+unsupported:
+   *pImageFormatProperties = (VkImageFormatProperties) {
+      .maxExtent = { 0, 0, 0 },
+      .maxMipLevels = 0,
+      .maxArrayLayers = 0,
+      .sampleCounts = 0,
+      .maxResourceSize = 0,
+   };
+
+   return VK_SUCCESS;
+}
+
+void anv_GetPhysicalDeviceSparseImageFormatProperties(
+    VkPhysicalDevice                            physicalDevice,
+    VkFormat                                    format,
+    VkImageType                                 type,
+    uint32_t                                    samples,
+    VkImageUsageFlags                           usage,
+    VkImageTiling                               tiling,
+    uint32_t*                                   pNumProperties,
+    VkSparseImageFormatProperties*              pProperties)
+{
+   /* Sparse images are not yet supported. */
+   *pNumProperties = 0;
+}
diff --git a/src/vulkan/anv_gem.c b/src/vulkan/anv_gem.c
new file mode 100644 (file)
index 0000000..0a7be35
--- /dev/null
@@ -0,0 +1,358 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define _DEFAULT_SOURCE
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#define VG_CLEAR(s) VG(memset(&s, 0, sizeof(s)))
+
+static int
+anv_ioctl(int fd, unsigned long request, void *arg)
+{
+   int ret;
+
+   do {
+      ret = ioctl(fd, request, arg);
+   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+
+   return ret;
+}
+
+/**
+ * Wrapper around DRM_IOCTL_I915_GEM_CREATE.
+ *
+ * Return gem handle, or 0 on failure. Gem handles are never 0.
+ */
+uint32_t
+anv_gem_create(struct anv_device *device, size_t size)
+{
+   struct drm_i915_gem_create gem_create;
+   int ret;
+
+   VG_CLEAR(gem_create);
+   gem_create.size = size;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
+   if (ret != 0) {
+      /* FIXME: What do we do if this fails? */
+      return 0;
+   }
+
+   return gem_create.handle;
+}
+
+void
+anv_gem_close(struct anv_device *device, uint32_t gem_handle)
+{
+   struct drm_gem_close close;
+
+   VG_CLEAR(close);
+   close.handle = gem_handle;
+   anv_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+/**
+ * Wrapper around DRM_IOCTL_I915_GEM_MMAP.
+ */
+void*
+anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
+             uint64_t offset, uint64_t size, uint32_t flags)
+{
+   struct drm_i915_gem_mmap gem_mmap;
+   int ret;
+
+   gem_mmap.handle = gem_handle;
+   VG_CLEAR(gem_mmap.pad);
+   gem_mmap.offset = offset;
+   gem_mmap.size = size;
+   VG_CLEAR(gem_mmap.addr_ptr);
+   gem_mmap.flags = flags;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap);
+   if (ret != 0) {
+      /* FIXME: Is NULL the right error return? Cf MAP_INVALID */
+      return NULL;
+   }
+
+   VG(VALGRIND_MALLOCLIKE_BLOCK(gem_mmap.addr_ptr, gem_mmap.size, 0, 1));
+   return (void *)(uintptr_t) gem_mmap.addr_ptr;
+}
+
+/* This is just a wrapper around munmap, but it also notifies valgrind that
+ * this map is no longer valid.  Pair this with anv_gem_mmap().
+ */
+void
+anv_gem_munmap(void *p, uint64_t size)
+{
+   VG(VALGRIND_FREELIKE_BLOCK(p, 0));
+   munmap(p, size);
+}
+
+uint32_t
+anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
+{
+   struct drm_i915_gem_userptr userptr;
+   int ret;
+
+   VG_CLEAR(userptr);
+   userptr.user_ptr = (__u64)((unsigned long) mem);
+   userptr.user_size = size;
+   userptr.flags = 0;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);
+   if (ret == -1)
+      return 0;
+
+   return userptr.handle;
+}
+
+int
+anv_gem_set_caching(struct anv_device *device,
+                    uint32_t gem_handle, uint32_t caching)
+{
+   struct drm_i915_gem_caching gem_caching;
+
+   VG_CLEAR(gem_caching);
+   gem_caching.handle = gem_handle;
+   gem_caching.caching = caching;
+
+   return anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching);
+}
+
+int
+anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
+                   uint32_t read_domains, uint32_t write_domain)
+{
+   struct drm_i915_gem_set_domain gem_set_domain;
+
+   VG_CLEAR(gem_set_domain);
+   gem_set_domain.handle = gem_handle;
+   gem_set_domain.read_domains = read_domains;
+   gem_set_domain.write_domain = write_domain;
+
+   return anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &gem_set_domain);
+}
+
+/**
+ * On error, \a timeout_ns holds the remaining time.
+ */
+int
+anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
+{
+   struct drm_i915_gem_wait wait;
+   int ret;
+
+   VG_CLEAR(wait);
+   wait.bo_handle = gem_handle;
+   wait.timeout_ns = *timeout_ns;
+   wait.flags = 0;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+   *timeout_ns = wait.timeout_ns;
+
+   return ret;
+}
+
+int
+anv_gem_execbuffer(struct anv_device *device,
+                   struct drm_i915_gem_execbuffer2 *execbuf)
+{
+   return anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
+}
+
+int
+anv_gem_set_tiling(struct anv_device *device,
+                   uint32_t gem_handle, uint32_t stride, uint32_t tiling)
+{
+   struct drm_i915_gem_set_tiling set_tiling;
+   int ret;
+
+   /* set_tiling overwrites the input on the error path, so we have to open
+    * code anv_ioctl.
+    */
+
+   do {
+      VG_CLEAR(set_tiling);
+      set_tiling.handle = gem_handle;
+      set_tiling.tiling_mode = tiling;
+      set_tiling.stride = stride;
+
+      ret = ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+
+   return ret;
+}
+
+int
+anv_gem_get_param(int fd, uint32_t param)
+{
+   drm_i915_getparam_t gp;
+   int ret, tmp;
+
+   VG_CLEAR(gp);
+   gp.param = param;
+   gp.value = &tmp;
+   ret = anv_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret == 0)
+      return tmp;
+
+   return 0;
+}
+
+bool
+anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
+{
+   struct drm_gem_close close;
+   int ret;
+
+   struct drm_i915_gem_create gem_create;
+   VG_CLEAR(gem_create);
+   gem_create.size = 4096;
+
+   if (anv_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
+      assert(!"Failed to create GEM BO");
+      return false;
+   }
+
+   bool swizzled = false;
+
+   /* set_tiling overwrites the input on the error path, so we have to open
+    * code anv_ioctl.
+    */
+   struct drm_i915_gem_set_tiling set_tiling;
+   do {
+      VG_CLEAR(set_tiling);
+      set_tiling.handle = gem_create.handle;
+      set_tiling.tiling_mode = tiling;
+      set_tiling.stride = tiling == I915_TILING_X ? 512 : 128;
+
+      ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+
+   if (ret != 0) {
+      assert(!"Failed to set BO tiling");
+      goto close_and_return;
+   }
+
+   struct drm_i915_gem_get_tiling get_tiling;
+   VG_CLEAR(get_tiling);
+   get_tiling.handle = gem_create.handle;
+
+   if (anv_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
+      assert(!"Failed to get BO tiling");
+      goto close_and_return;
+   }
+
+   swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
+
+close_and_return:
+
+   VG_CLEAR(close);
+   close.handle = gem_create.handle;
+   anv_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
+
+   return swizzled;
+}
+
+int
+anv_gem_create_context(struct anv_device *device)
+{
+   struct drm_i915_gem_context_create create;
+   int ret;
+
+   VG_CLEAR(create);
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+   if (ret == -1)
+      return -1;
+
+   return create.ctx_id;
+}
+
+int
+anv_gem_destroy_context(struct anv_device *device, int context)
+{
+   struct drm_i915_gem_context_destroy destroy;
+
+   VG_CLEAR(destroy);
+   destroy.ctx_id = context;
+
+   return anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
+}
+
+int
+anv_gem_get_aperture(int fd, uint64_t *size)
+{
+   struct drm_i915_gem_get_aperture aperture;
+   int ret;
+
+   VG_CLEAR(aperture);
+   ret = anv_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
+   if (ret == -1)
+      return -1;
+
+   *size = aperture.aper_available_size;
+
+   return 0;
+}
+
+int
+anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
+{
+   struct drm_prime_handle args;
+   int ret;
+
+   VG_CLEAR(args);
+   args.handle = gem_handle;
+   args.flags = DRM_CLOEXEC;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
+   if (ret == -1)
+      return -1;
+
+   return args.fd;
+}
+
+uint32_t
+anv_gem_fd_to_handle(struct anv_device *device, int fd)
+{
+   struct drm_prime_handle args;
+   int ret;
+
+   VG_CLEAR(args);
+   args.fd = fd;
+
+   ret = anv_ioctl(device->fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args);
+   if (ret == -1)
+      return 0;
+
+   return args.handle;
+}
diff --git a/src/vulkan/anv_gem_stubs.c b/src/vulkan/anv_gem_stubs.c
new file mode 100644 (file)
index 0000000..3204fef
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define _DEFAULT_SOURCE
+
+#include <linux/memfd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+
+#include "anv_private.h"
+
+static inline int
+memfd_create(const char *name, unsigned int flags)
+{
+   return syscall(SYS_memfd_create, name, flags);
+}
+
+uint32_t
+anv_gem_create(struct anv_device *device, size_t size)
+{
+   int fd = memfd_create("fake bo", MFD_CLOEXEC);
+   if (fd == -1)
+      return 0;
+
+   assert(fd != 0);
+
+   if (ftruncate(fd, size) == -1)
+      return 0;
+
+   return fd;
+}
+
+void
+anv_gem_close(struct anv_device *device, uint32_t gem_handle)
+{
+   close(gem_handle);
+}
+
+void*
+anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
+             uint64_t offset, uint64_t size, uint32_t flags)
+{
+   /* Ignore flags, as they're specific to I915_GEM_MMAP. */
+   (void) flags;
+
+   return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               gem_handle, offset);
+}
+
+/* Stub counterpart of anv_gem_munmap(): a plain munmap with no valgrind
+ * bookkeeping, unlike the real implementation.  Pair this with anv_gem_mmap().
+ */
+void
+anv_gem_munmap(void *p, uint64_t size)
+{
+   munmap(p, size);
+}
+
+uint32_t
+anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
+{
+   return -1;
+}
+
+int
+anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
+{
+   return 0;
+}
+
+int
+anv_gem_execbuffer(struct anv_device *device,
+                   struct drm_i915_gem_execbuffer2 *execbuf)
+{
+   return 0;
+}
+
+int
+anv_gem_set_tiling(struct anv_device *device,
+                   uint32_t gem_handle, uint32_t stride, uint32_t tiling)
+{
+   return 0;
+}
+
+int
+anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle,
+                    uint32_t caching)
+{
+   return 0;
+}
+
+int
+anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
+                   uint32_t read_domains, uint32_t write_domain)
+{
+   return 0;
+}
+
+int
+anv_gem_get_param(int fd, uint32_t param)
+{
+   unreachable("Unused");
+}
+
+bool
+anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
+{
+   unreachable("Unused");
+}
+
+int
+anv_gem_create_context(struct anv_device *device)
+{
+   unreachable("Unused");
+}
+
+int
+anv_gem_destroy_context(struct anv_device *device, int context)
+{
+   unreachable("Unused");
+}
+
+int
+anv_gem_get_aperture(int fd, uint64_t *size)
+{
+   unreachable("Unused");
+}
+
+int
+anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
+{
+   unreachable("Unused");
+}
+
+uint32_t
+anv_gem_fd_to_handle(struct anv_device *device, int fd)
+{
+   unreachable("Unused");
+}
diff --git a/src/vulkan/anv_gen_macros.h b/src/vulkan/anv_gen_macros.h
new file mode 100644 (file)
index 0000000..ef2ecd5
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+/* Macros for handling per-gen compilation.
+ *
+ * The prefixing macros GENX() and genX() automatically prefix whatever you
+ * give them by GENX_ or genX_  where X is the gen number.
+ *
+ * You can declare a function to be used on some range of gens like this:
+ *
+ * GENX_FUNC(GEN7, GEN75) void
+ * genX(my_function_name)(args...)
+ * {
+ *    // Do stuff
+ * }
+ *
+ * If the file is compiled for any set of gens containing gen7 and gen75,
+ * the function will effectively only get compiled twice as
+ * gen7_my_function_name and gen75_my_function_name.  The function has to
+ * be compilable on all gens, but it will become a static inline that gets
+ * discarded by the compiler on all gens not in range.
+ *
+ * You can do pseudo-runtime checks in your function such as
+ *
+ * if (ANV_GEN > 8 || ANV_IS_HASWELL) {
+ *    // Do something
+ * }
+ *
+ * The contents of the if statement must be valid regardless of gen, but
+ * the if will get compiled away on everything except haswell.
+ *
+ * For places where you really do have a compile-time conflict, you can
+ * use preprocessor logic:
+ *
+ * #if (ANV_GEN > 8 || ANV_IS_HASWELL)
+ *    // Do something
+ * #endif
+ *
+ * However, it is strongly recommended that the former be used whenever
+ * possible.
+ */
+
+/* Base macro defined on the command line.  If we don't have this, we can't
+ * do anything.
+ */
+#ifdef ANV_GENx10
+
+/* Gen checking macros */
+#define ANV_GEN ((ANV_GENx10) / 10)
+#define ANV_IS_HASWELL ((ANV_GENx10) == 75)
+
+/* Prefixing macros */
+#if (ANV_GENx10 == 70)
+#  define GENX(X) GEN7_##X
+#  define genX(x) gen7_##x
+#elif (ANV_GENx10 == 75)
+#  define GENX(X) GEN75_##X
+#  define genX(x) gen75_##x
+#elif (ANV_GENx10 == 80)
+#  define GENX(X) GEN8_##X
+#  define genX(x) gen8_##x
+#elif (ANV_GENx10 == 90)
+#  define GENX(X) GEN9_##X
+#  define genX(x) gen9_##x
+#else
+#  error "Need to add prefixing macros for your gen"
+#endif
+
+/* Macros for comparing gens */
+#if (ANV_GENx10 >= 70)
+#define __ANV_GEN_GE_GEN7(T, F) T
+#else
+#define __ANV_GEN_GE_GEN7(T, F) F
+#endif
+
+#if (ANV_GENx10 <= 70)
+#define __ANV_GEN_LE_GEN7(T, F) T
+#else
+#define __ANV_GEN_LE_GEN7(T, F) F
+#endif
+
+#if (ANV_GENx10 >= 75)
+#define __ANV_GEN_GE_GEN75(T, F) T
+#else
+#define __ANV_GEN_GE_GEN75(T, F) F
+#endif
+
+#if (ANV_GENx10 <= 75)
+#define __ANV_GEN_LE_GEN75(T, F) T
+#else
+#define __ANV_GEN_LE_GEN75(T, F) F
+#endif
+
+#if (ANV_GENx10 >= 80)
+#define __ANV_GEN_GE_GEN8(T, F) T
+#else
+#define __ANV_GEN_GE_GEN8(T, F) F
+#endif
+
+#if (ANV_GENx10 <= 80)
+#define __ANV_GEN_LE_GEN8(T, F) T
+#else
+#define __ANV_GEN_LE_GEN8(T, F) F
+#endif
+
+#if (ANV_GENx10 >= 90)
+#define __ANV_GEN_GE_GEN9(T, F) T
+#else
+#define __ANV_GEN_GE_GEN9(T, F) F
+#endif
+
+#if (ANV_GENx10 <= 90)
+#define __ANV_GEN_LE_GEN9(T, F) T
+#else
+#define __ANV_GEN_LE_GEN9(T, F) F
+#endif
+
+#define __ANV_GEN_IN_RANGE(start, end, T, F) \
+   __ANV_GEN_GE_##start(__ANV_GEN_LE_##end(T, F), F)
+
+/* Declares a function as static inline if it's not in range */
+#define GENX_FUNC(start, end) __ANV_GEN_IN_RANGE(start, end, , static inline)
+
+#endif /* ANV_GENx10 */
diff --git a/src/vulkan/anv_icd.json.in b/src/vulkan/anv_icd.json.in
new file mode 100644 (file)
index 0000000..cef6a30
--- /dev/null
@@ -0,0 +1,8 @@
+{
+    "file_format_version": "1.0.0",
+    "ICD": {
+        "library_path": "@abs_top_builddir@/lib/libvulkan.so.0.0.0",
+        "abi_versions": "0.210.1"
+    }
+}
+
diff --git a/src/vulkan/anv_image.c b/src/vulkan/anv_image.c
new file mode 100644 (file)
index 0000000..ba3b3b2
--- /dev/null
@@ -0,0 +1,660 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+/**
+ * The \a format argument is required and overrides any format found in struct
+ * anv_image_create_info. Exactly one bit must be set in \a aspect.
+ */
+static isl_surf_usage_flags_t
+choose_isl_surf_usage(const struct anv_image_create_info *info,
+                      VkImageAspectFlags aspect)
+{
+   const VkImageCreateInfo *vk_info = info->vk_info;
+   isl_surf_usage_flags_t isl_flags = 0;
+
+   /* FINISHME: Support aux surfaces */
+   isl_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
+   if (vk_info->usage & VK_IMAGE_USAGE_SAMPLED_BIT)
+      isl_flags |= ISL_SURF_USAGE_TEXTURE_BIT;
+
+   if (vk_info->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
+      isl_flags |= ISL_SURF_USAGE_TEXTURE_BIT;
+
+   if (vk_info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
+      isl_flags |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
+   if (vk_info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
+      isl_flags |= ISL_SURF_USAGE_CUBE_BIT;
+
+   if (vk_info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+      switch (aspect) {
+      default:
+         unreachable("bad VkImageAspect");
+      case VK_IMAGE_ASPECT_DEPTH_BIT:
+         isl_flags |= ISL_SURF_USAGE_DEPTH_BIT;
+         break;
+      case VK_IMAGE_ASPECT_STENCIL_BIT:
+         isl_flags |= ISL_SURF_USAGE_STENCIL_BIT;
+         break;
+      }
+   }
+
+   if (vk_info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+      /* Meta implements transfers by sampling from the source image. */
+      isl_flags |= ISL_SURF_USAGE_TEXTURE_BIT;
+   }
+
+   if (vk_info->usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+      /* Meta implements transfers by rendering into the destination image. */
+      isl_flags |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+   }
+
+   return isl_flags;
+}
+
+/**
+ * Exactly one bit must be set in \a aspect.
+ */
+static struct anv_surface *
+get_surface(struct anv_image *image, VkImageAspectFlags aspect)
+{
+   switch (aspect) {
+   default:
+      unreachable("bad VkImageAspect");
+   case VK_IMAGE_ASPECT_COLOR_BIT:
+      return &image->color_surface;
+   case VK_IMAGE_ASPECT_DEPTH_BIT:
+      return &image->depth_surface;
+   case VK_IMAGE_ASPECT_STENCIL_BIT:
+      return &image->stencil_surface;
+   }
+}
+
+/**
+ * Initialize the anv_image::*_surface selected by \a aspect. Then update the
+ * image's memory requirements (that is, the image's size and alignment).
+ *
+ * Exactly one bit must be set in \a aspect.
+ */
+static VkResult
+make_surface(const struct anv_device *dev,
+             struct anv_image *image,
+             const struct anv_image_create_info *anv_info,
+             VkImageAspectFlags aspect)
+{
+   const VkImageCreateInfo *vk_info = anv_info->vk_info;
+   bool ok UNUSED;
+
+   static const enum isl_surf_dim vk_to_isl_surf_dim[] = {
+      [VK_IMAGE_TYPE_1D] = ISL_SURF_DIM_1D,
+      [VK_IMAGE_TYPE_2D] = ISL_SURF_DIM_2D,
+      [VK_IMAGE_TYPE_3D] = ISL_SURF_DIM_3D,
+   };
+
+   isl_tiling_flags_t tiling_flags = anv_info->isl_tiling_flags;
+   if (vk_info->tiling == VK_IMAGE_TILING_LINEAR)
+      tiling_flags &= ISL_TILING_LINEAR_BIT;
+
+   struct anv_surface *anv_surf = get_surface(image, aspect);
+
+   ok = isl_surf_init(&dev->isl_dev, &anv_surf->isl,
+      .dim = vk_to_isl_surf_dim[vk_info->imageType],
+      .format = anv_get_isl_format(vk_info->format, aspect, vk_info->tiling),
+      .width = vk_info->extent.width,
+      .height = vk_info->extent.height,
+      .depth = vk_info->extent.depth,
+      .levels = vk_info->mipLevels,
+      .array_len = vk_info->arrayLayers,
+      .samples = vk_info->samples,
+      .min_alignment = 0,
+      .min_pitch = 0,
+      .usage = choose_isl_surf_usage(anv_info, aspect),
+      .tiling_flags = tiling_flags);
+
+   /* isl_surf_init() will fail only if provided invalid input. Invalid input
+    * is illegal in Vulkan.
+    */
+   assert(ok);
+
+   anv_surf->offset = align_u32(image->size, anv_surf->isl.alignment);
+   image->size = anv_surf->offset + anv_surf->isl.size;
+   image->alignment = MAX(image->alignment, anv_surf->isl.alignment);
+
+   return VK_SUCCESS;
+}
+
+static VkImageUsageFlags
+anv_image_get_full_usage(const VkImageCreateInfo *info)
+{
+   VkImageUsageFlags usage = info->usage;
+
+   if (usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+      /* Meta will transfer from the image by binding it as a texture. */
+      usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+   }
+
+   if (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+      /* Meta will transfer to the image by binding it as a color attachment,
+       * even if the image format is not a color format.
+       */
+      usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+   }
+
+   return usage;
+}
+
+VkResult
+anv_image_create(VkDevice _device,
+                 const struct anv_image_create_info *create_info,
+                 const VkAllocationCallbacks* alloc,
+                 VkImage *pImage)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const VkImageCreateInfo *pCreateInfo = create_info->vk_info;
+   struct anv_image *image = NULL;
+   VkResult r;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO);
+
+   anv_assert(pCreateInfo->mipLevels > 0);
+   anv_assert(pCreateInfo->arrayLayers > 0);
+   anv_assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT);
+   anv_assert(pCreateInfo->extent.width > 0);
+   anv_assert(pCreateInfo->extent.height > 0);
+   anv_assert(pCreateInfo->extent.depth > 0);
+
+   image = anv_alloc2(&device->alloc, alloc, sizeof(*image), 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!image)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   memset(image, 0, sizeof(*image));
+   image->type = pCreateInfo->imageType;
+   image->extent = pCreateInfo->extent;
+   image->vk_format = pCreateInfo->format;
+   image->format = anv_format_for_vk_format(pCreateInfo->format);
+   image->levels = pCreateInfo->mipLevels;
+   image->array_size = pCreateInfo->arrayLayers;
+   image->usage = anv_image_get_full_usage(pCreateInfo);
+   image->tiling = pCreateInfo->tiling;
+
+   if (image->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
+      image->needs_nonrt_surface_state = true;
+   }
+
+   if (image->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+      image->needs_color_rt_surface_state = true;
+   }
+
+   if (image->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+      image->needs_storage_surface_state = true;
+   }
+
+   if (likely(anv_format_is_color(image->format))) {
+      r = make_surface(device, image, create_info,
+                       VK_IMAGE_ASPECT_COLOR_BIT);
+      if (r != VK_SUCCESS)
+         goto fail;
+   } else {
+      if (image->format->depth_format) {
+         r = make_surface(device, image, create_info,
+                          VK_IMAGE_ASPECT_DEPTH_BIT);
+         if (r != VK_SUCCESS)
+            goto fail;
+      }
+
+      if (image->format->has_stencil) {
+         r = make_surface(device, image, create_info,
+                          VK_IMAGE_ASPECT_STENCIL_BIT);
+         if (r != VK_SUCCESS)
+            goto fail;
+      }
+   }
+
+   *pImage = anv_image_to_handle(image);
+
+   return VK_SUCCESS;
+
+fail:
+   if (image)
+      anv_free2(&device->alloc, alloc, image);
+
+   return r;
+}
+
+VkResult
+anv_CreateImage(VkDevice device,
+                const VkImageCreateInfo *pCreateInfo,
+                const VkAllocationCallbacks *pAllocator,
+                VkImage *pImage)
+{
+   return anv_image_create(device,
+      &(struct anv_image_create_info) {
+         .vk_info = pCreateInfo,
+         .isl_tiling_flags = ISL_TILING_ANY_MASK,
+      },
+      pAllocator,
+      pImage);
+}
+
+void
+anv_DestroyImage(VkDevice _device, VkImage _image,
+                 const VkAllocationCallbacks *pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   anv_free2(&device->alloc, pAllocator, anv_image_from_handle(_image));
+}
+
+static void
+anv_surface_get_subresource_layout(struct anv_image *image,
+                                   struct anv_surface *surface,
+                                   const VkImageSubresource *subresource,
+                                   VkSubresourceLayout *layout)
+{
+   /* If we are on a non-zero mip level or array slice, we need to
+    * calculate a real offset.
+    */
+   anv_assert(subresource->mipLevel == 0);
+   anv_assert(subresource->arrayLayer == 0);
+
+   layout->offset = surface->offset;
+   layout->rowPitch = surface->isl.row_pitch;
+   layout->depthPitch = isl_surf_get_array_pitch(&surface->isl);
+   layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl);
+   layout->size = surface->isl.size;
+}
+
+void anv_GetImageSubresourceLayout(
+    VkDevice                                    device,
+    VkImage                                     _image,
+    const VkImageSubresource*                   pSubresource,
+    VkSubresourceLayout*                        pLayout)
+{
+   ANV_FROM_HANDLE(anv_image, image, _image);
+
+   assert(__builtin_popcount(pSubresource->aspectMask) == 1);
+
+   switch (pSubresource->aspectMask) {
+   case VK_IMAGE_ASPECT_COLOR_BIT:
+      anv_surface_get_subresource_layout(image, &image->color_surface,
+                                         pSubresource, pLayout);
+      break;
+   case VK_IMAGE_ASPECT_DEPTH_BIT:
+      anv_surface_get_subresource_layout(image, &image->depth_surface,
+                                         pSubresource, pLayout);
+      break;
+   case VK_IMAGE_ASPECT_STENCIL_BIT:
+      anv_surface_get_subresource_layout(image, &image->stencil_surface,
+                                         pSubresource, pLayout);
+      break;
+   default:
+      assert(!"Invalid image aspect");
+   }
+}
+
+VkResult
+anv_validate_CreateImageView(VkDevice _device,
+                             const VkImageViewCreateInfo *pCreateInfo,
+                             const VkAllocationCallbacks *pAllocator,
+                             VkImageView *pView)
+{
+   ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+   const VkImageSubresourceRange *subresource;
+   const struct anv_format *view_format_info;
+
+   /* Validate structure type before dereferencing it. */
+   assert(pCreateInfo);
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO);
+   subresource = &pCreateInfo->subresourceRange;
+
+   /* Validate viewType is in range before using it. */
+   assert(pCreateInfo->viewType >= VK_IMAGE_VIEW_TYPE_BEGIN_RANGE);
+   assert(pCreateInfo->viewType <= VK_IMAGE_VIEW_TYPE_END_RANGE);
+
+   /* Validate format is in range before using it. */
+   assert(pCreateInfo->format >= VK_FORMAT_BEGIN_RANGE);
+   assert(pCreateInfo->format <= VK_FORMAT_END_RANGE);
+   view_format_info = anv_format_for_vk_format(pCreateInfo->format);
+
+   /* Validate channel swizzles. */
+   assert(pCreateInfo->components.r >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
+   assert(pCreateInfo->components.r <= VK_COMPONENT_SWIZZLE_END_RANGE);
+   assert(pCreateInfo->components.g >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
+   assert(pCreateInfo->components.g <= VK_COMPONENT_SWIZZLE_END_RANGE);
+   assert(pCreateInfo->components.b >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
+   assert(pCreateInfo->components.b <= VK_COMPONENT_SWIZZLE_END_RANGE);
+   assert(pCreateInfo->components.a >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
+   assert(pCreateInfo->components.a <= VK_COMPONENT_SWIZZLE_END_RANGE);
+
+   /* Validate subresource. */
+   assert(subresource->aspectMask != 0);
+   assert(subresource->levelCount > 0);
+   assert(subresource->layerCount > 0);
+   assert(subresource->baseMipLevel < image->levels);
+   assert(subresource->baseMipLevel + subresource->levelCount <= image->levels);
+   assert(subresource->baseArrayLayer < image->array_size);
+   assert(subresource->baseArrayLayer + subresource->layerCount <= image->array_size);
+   assert(pView);
+
+   const VkImageAspectFlags ds_flags = VK_IMAGE_ASPECT_DEPTH_BIT
+                                     | VK_IMAGE_ASPECT_STENCIL_BIT;
+
+   /* Validate format. */
+   if (subresource->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+      assert(subresource->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+      assert(!image->format->depth_format);
+      assert(!image->format->has_stencil);
+      assert(!view_format_info->depth_format);
+      assert(!view_format_info->has_stencil);
+      assert(view_format_info->isl_layout->bs ==
+             image->format->isl_layout->bs);
+   } else if (subresource->aspectMask & ds_flags) {
+      assert((subresource->aspectMask & ~ds_flags) == 0);
+      /* Depth invariants must be gated on the DEPTH aspect (was STENCIL). */
+      if (subresource->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+         assert(image->format->depth_format);
+         assert(view_format_info->depth_format);
+         assert(view_format_info->isl_layout->bs ==
+                image->format->isl_layout->bs);
+      }
+
+      if (subresource->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         /* FINISHME: Is it legal to have an R8 view of S8? */
+         assert(image->format->has_stencil);
+         assert(view_format_info->has_stencil);
+      }
+   } else {
+      assert(!"bad VkImageSubresourceRange::aspectFlags");
+   }
+
+   return anv_CreateImageView(_device, pCreateInfo, pAllocator, pView);
+}
+
+void
+anv_image_view_init(struct anv_image_view *iview,
+                    struct anv_device *device,
+                    const VkImageViewCreateInfo* pCreateInfo,
+                    struct anv_cmd_buffer *cmd_buffer)
+{
+   ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+   const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
+
+   assert(range->layerCount > 0);
+   assert(range->baseMipLevel < image->levels);
+   assert(image->usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+                          VK_IMAGE_USAGE_STORAGE_BIT |
+                          VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+                          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT));
+
+   switch (image->type) {
+   default:
+      unreachable("bad VkImageType");
+   case VK_IMAGE_TYPE_1D:
+   case VK_IMAGE_TYPE_2D:
+      assert(range->baseArrayLayer + range->layerCount - 1 <= image->array_size);
+      break;
+   case VK_IMAGE_TYPE_3D:
+      assert(range->baseArrayLayer + range->layerCount - 1
+             <= anv_minify(image->extent.depth, range->baseMipLevel));
+      break;
+   }
+
+   struct anv_surface *surface =
+      anv_image_get_surface_for_aspect_mask(image, range->aspectMask);
+
+   iview->image = image;
+   iview->bo = image->bo;
+   iview->offset = image->offset + surface->offset;
+
+   iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask;
+   iview->vk_format = pCreateInfo->format;
+   iview->format = anv_get_isl_format(pCreateInfo->format, iview->aspect_mask,
+                                      image->tiling);
+
+   iview->extent = (VkExtent3D) {
+      .width = anv_minify(image->extent.width, range->baseMipLevel),
+      .height = anv_minify(image->extent.height, range->baseMipLevel),
+      .depth = anv_minify(image->extent.depth, range->baseMipLevel),
+   };
+
+   switch (device->info.gen) {
+   case 7:
+      if (device->info.is_haswell)
+         gen75_image_view_init(iview, device, pCreateInfo, cmd_buffer);
+      else
+         gen7_image_view_init(iview, device, pCreateInfo, cmd_buffer);
+      break;
+   case 8:
+      gen8_image_view_init(iview, device, pCreateInfo, cmd_buffer);
+      break;
+   case 9:
+      gen9_image_view_init(iview, device, pCreateInfo, cmd_buffer);
+      break;
+   default:
+      unreachable("unsupported gen\n");
+   }
+}
+
+VkResult
+anv_CreateImageView(VkDevice _device,
+                    const VkImageViewCreateInfo *pCreateInfo,
+                    const VkAllocationCallbacks *pAllocator,
+                    VkImageView *pView)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_image_view *view;
+
+   view = anv_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (view == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   anv_image_view_init(view, device, pCreateInfo, NULL);
+
+   *pView = anv_image_view_to_handle(view);
+
+   return VK_SUCCESS;
+}
+
+void
+anv_DestroyImageView(VkDevice _device, VkImageView _iview,
+                     const VkAllocationCallbacks *pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_image_view, iview, _iview);
+
+   if (iview->image->needs_color_rt_surface_state) {
+      anv_state_pool_free(&device->surface_state_pool,
+                          iview->color_rt_surface_state);
+   }
+
+   if (iview->image->needs_nonrt_surface_state) {
+      anv_state_pool_free(&device->surface_state_pool,
+                          iview->nonrt_surface_state);
+   }
+
+   if (iview->image->needs_storage_surface_state) {
+      anv_state_pool_free(&device->surface_state_pool,
+                          iview->storage_surface_state);
+   }
+
+   anv_free2(&device->alloc, pAllocator, iview);
+}
+
+/* Implementation of vkCreateBufferView.
+ *
+ * Allocates the anv_buffer_view wrapper and, depending on the underlying
+ * buffer's usage flags, bakes one RENDER_SURFACE_STATE for uniform texel
+ * buffer access and one (with an isl-lowered format) for storage texel
+ * buffer access.  States that are not needed are left zeroed so that
+ * anv_DestroyBufferView can tell they were never allocated.
+ */
+VkResult
+anv_CreateBufferView(VkDevice _device,
+                     const VkBufferViewCreateInfo *pCreateInfo,
+                     const VkAllocationCallbacks *pAllocator,
+                     VkBufferView *pView)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
+   struct anv_buffer_view *view;
+
+   view = anv_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!view)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   const struct anv_format *format =
+      anv_format_for_vk_format(pCreateInfo->format);
+
+   view->format = format->surface_format;
+   view->bo = buffer->bo;
+   /* The view offset is BO-relative, so fold in the buffer's own offset
+    * within its backing memory object.
+    */
+   view->offset = buffer->offset + pCreateInfo->offset;
+   /* VK_WHOLE_SIZE means "from pCreateInfo->offset to the end of the
+    * buffer".  The remaining size must be computed from the buffer-relative
+    * offset, not the BO-relative view->offset, or sub-allocated buffers
+    * (buffer->offset != 0) get a too-small range.
+    */
+   view->range = pCreateInfo->range == VK_WHOLE_SIZE ?
+                 buffer->size - pCreateInfo->offset : pCreateInfo->range;
+
+   if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
+      view->surface_state =
+         anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+
+      anv_fill_buffer_surface_state(device, view->surface_state.map,
+                                    view->format,
+                                    view->offset, view->range,
+                                    format->isl_layout->bs);
+   } else {
+      view->surface_state = (struct anv_state){ 0 };
+   }
+
+   if (buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) {
+      view->storage_surface_state =
+         anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+
+      /* Untyped surface reads/writes don't support every format, so lower
+       * to a format the shader-access path can handle.
+       */
+      enum isl_format storage_format =
+         isl_lower_storage_image_format(&device->isl_dev, view->format);
+
+      anv_fill_buffer_surface_state(device, view->storage_surface_state.map,
+                                    storage_format,
+                                    view->offset, view->range,
+                                    format->isl_layout->bs);
+   } else {
+      view->storage_surface_state = (struct anv_state){ 0 };
+   }
+
+   *pView = anv_buffer_view_to_handle(view);
+
+   return VK_SUCCESS;
+}
+
+/* Implementation of vkDestroyBufferView: hand any surface states the view
+ * owns back to the surface state pool, then release the view itself.
+ */
+void
+anv_DestroyBufferView(VkDevice _device, VkBufferView bufferView,
+                      const VkAllocationCallbacks *pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_buffer_view, view, bufferView);
+
+   /* An alloc_size of zero marks a state that was never allocated
+    * (the buffer lacked the corresponding usage bit at creation).
+    */
+   if (view->surface_state.alloc_size > 0) {
+      anv_state_pool_free(&device->surface_state_pool, view->surface_state);
+   }
+
+   if (view->storage_surface_state.alloc_size > 0) {
+      anv_state_pool_free(&device->surface_state_pool,
+                          view->storage_surface_state);
+   }
+
+   anv_free2(&device->alloc, pAllocator, view);
+}
+
+/* Map a VkImageAspectFlags mask to the anv_surface inside the image that
+ * backs that aspect (color, depth, or stencil).
+ *
+ * Combined depth+stencil requests currently fall back to the depth surface
+ * with an anv_finishme() note; single-aspect depth/stencil requests assert
+ * that the image's format actually has that aspect.
+ */
+struct anv_surface *
+anv_image_get_surface_for_aspect_mask(struct anv_image *image, VkImageAspectFlags aspect_mask)
+{
+   switch (aspect_mask) {
+   case VK_IMAGE_ASPECT_COLOR_BIT:
+      /* Dragons will eat you.
+       *
+       * Meta attaches all destination surfaces as color render targets. Guess
+       * what surface the Meta Dragons really want.
+       */
+      if (image->format->depth_format && image->format->has_stencil) {
+         anv_finishme("combined depth stencil formats");
+         return &image->depth_surface;
+      } else if (image->format->depth_format) {
+         return &image->depth_surface;
+      } else if (image->format->has_stencil) {
+         return &image->stencil_surface;
+      } else {
+         return &image->color_surface;
+      }
+      break;
+   case VK_IMAGE_ASPECT_DEPTH_BIT:
+      assert(image->format->depth_format);
+      return &image->depth_surface;
+   case VK_IMAGE_ASPECT_STENCIL_BIT:
+      assert(image->format->has_stencil);
+      return &image->stencil_surface;
+   case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT:
+      if (image->format->depth_format && image->format->has_stencil) {
+         /* FINISHME: The Vulkan spec (git a511ba2) requires support for combined
+          * depth stencil formats. Specifically, it states:
+          *
+          *    At least one of ename:VK_FORMAT_D24_UNORM_S8_UINT or
+          *    ename:VK_FORMAT_D32_SFLOAT_S8_UINT must be supported.
+          */
+         anv_finishme("combined depthstencil aspect");
+         return &image->depth_surface;
+      } else if (image->format->depth_format) {
+         return &image->depth_surface;
+      } else if (image->format->has_stencil) {
+         return &image->stencil_surface;
+      }
+      /* fallthrough */
+    default:
+       /* NULL return is unreachable in debug builds; it keeps compilers
+        * without __builtin_unreachable() quiet.
+        */
+       unreachable("image does not have aspect");
+       return NULL;
+   }
+}
+
+/* Fill out the brw_image_param push-constant block for an image view.
+ *
+ * Currently a stub: the struct is zeroed and a finishme is logged, so any
+ * shader consuming these parameters will observe all-zero values.
+ */
+void
+anv_image_view_fill_image_param(struct anv_device *device,
+                                struct anv_image_view *view,
+                                struct brw_image_param *param)
+{
+   memset(param, 0, sizeof *param);
+   anv_finishme("Actually fill out brw_image_param");
+}
+
+/* Fill out the brw_image_param push-constant block for a buffer (texel
+ * buffer) view.  Only the X-dimension stride and size are meaningful for a
+ * 1-D buffer; swizzling is explicitly disabled.
+ */
+void
+anv_buffer_view_fill_image_param(struct anv_device *device,
+                                 struct anv_buffer_view *view,
+                                 struct brw_image_param *param)
+{
+   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
+    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
+    * detailed explanation of these parameters.
+    */
+   param->swizzling[0] = 0xff;
+   param->swizzling[1] = 0xff;
+
+   /* stride = bytes per texel; size = number of texels in the view. */
+   param->stride[0] = isl_format_layouts[view->format].bs;
+   param->size[0] = view->range / param->stride[0];
+}
diff --git a/src/vulkan/anv_intel.c b/src/vulkan/anv_intel.c
new file mode 100644 (file)
index 0000000..d95d9af
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+/* Intel-private entry point: import a dma-buf fd as a VkDeviceMemory plus a
+ * 2D, single-mip, single-layer VkImage bound to it at offset 0.
+ *
+ * The fd is turned into a GEM handle via PRIME import; the memory object's
+ * size is derived from the caller-supplied stride and height.  On any
+ * failure the partially-imported state is unwound and an error returned.
+ */
+VkResult anv_CreateDmaBufImageINTEL(
+    VkDevice                                    _device,
+    const VkDmaBufImageCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkDeviceMemory*                             pMem,
+    VkImage*                                    pImage)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_device_memory *mem;
+   struct anv_image *image;
+   VkResult result;
+   VkImage image_h;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DMA_BUF_IMAGE_CREATE_INFO_INTEL);
+
+   mem = anv_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
+                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (mem == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   mem->bo.gem_handle = anv_gem_fd_to_handle(device, pCreateInfo->fd);
+   if (!mem->bo.gem_handle) {
+      result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      goto fail;
+   }
+
+   mem->bo.map = NULL;
+   mem->bo.index = 0;
+   mem->bo.offset = 0;
+   mem->bo.size = pCreateInfo->strideInBytes * pCreateInfo->extent.height;
+
+   /* Check the result: on failure image_h is invalid and dereferencing it
+    * below would crash; unwind the GEM import instead.
+    */
+   result = anv_image_create(_device,
+      &(struct anv_image_create_info) {
+         .isl_tiling_flags = ISL_TILING_X_BIT,
+         .stride = pCreateInfo->strideInBytes,
+         .vk_info =
+      &(VkImageCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+         .imageType = VK_IMAGE_TYPE_2D,
+         .format = pCreateInfo->format,
+         .extent = pCreateInfo->extent,
+         .mipLevels = 1,
+         .arrayLayers = 1,
+         .samples = 1,
+         /* FIXME: Need a way to use X tiling to allow scanout */
+         .tiling = VK_IMAGE_TILING_OPTIMAL,
+         .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+         .flags = 0,
+      }},
+      pAllocator, &image_h);
+   if (result != VK_SUCCESS)
+      goto fail_import;
+
+   image = anv_image_from_handle(image_h);
+   image->bo = &mem->bo;
+   image->offset = 0;
+
+   assert(image->extent.width > 0);
+   assert(image->extent.height > 0);
+   assert(image->extent.depth == 1);
+
+   *pMem = anv_device_memory_to_handle(mem);
+   *pImage = anv_image_to_handle(image);
+
+   return VK_SUCCESS;
+
+ fail_import:
+   anv_gem_close(device, mem->bo.gem_handle);
+
+ fail:
+   anv_free2(&device->alloc, pAllocator, mem);
+
+   return result;
+}
diff --git a/src/vulkan/anv_meta.c b/src/vulkan/anv_meta.c
new file mode 100644 (file)
index 0000000..9e13299
--- /dev/null
@@ -0,0 +1,1518 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_meta.h"
+#include "anv_meta_clear.h"
+#include "anv_private.h"
+#include "glsl/nir/nir_builder.h"
+
+struct anv_render_pass anv_meta_dummy_renderpass = {0};
+
+/* Build the shared meta vertex shader in NIR: copies a vec4 position from
+ * VERT_ATTRIB_GENERIC0 to gl_Position and one extra vec4 attribute from
+ * VERT_ATTRIB_GENERIC1 to VARYING_SLOT_VAR0.
+ *
+ * attr_flat selects flat vs. smooth interpolation on the pass-through
+ * attribute (flat for clear colors, smooth for blit texture coordinates).
+ * Caller owns the returned shader and must ralloc_free() it.
+ */
+static nir_shader *
+build_nir_vertex_shader(bool attr_flat)
+{
+   nir_builder b;
+
+   const struct glsl_type *vertex_type = glsl_vec4_type();
+
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, NULL);
+   b.shader->info.name = ralloc_strdup(b.shader, "meta_blit_vs");
+
+   nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                              vertex_type, "a_pos");
+   pos_in->data.location = VERT_ATTRIB_GENERIC0;
+   nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                               vertex_type, "gl_Position");
+   pos_out->data.location = VARYING_SLOT_POS;
+   nir_copy_var(&b, pos_out, pos_in);
+
+   /* Add one more pass-through attribute.  For clear shaders, this is used
+    * to store the color and for blit shaders it's the texture coordinate.
+    */
+   const struct glsl_type *attr_type = glsl_vec4_type();
+   nir_variable *attr_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                               attr_type, "a_attr");
+   attr_in->data.location = VERT_ATTRIB_GENERIC1;
+   nir_variable *attr_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                attr_type, "v_attr");
+   attr_out->data.location = VARYING_SLOT_VAR0;
+   attr_out->data.interpolation = attr_flat ? INTERP_QUALIFIER_FLAT :
+                                              INTERP_QUALIFIER_SMOOTH;
+   nir_copy_var(&b, attr_out, attr_in);
+
+   return b.shader;
+}
+
+/* Build the blit fragment shader in NIR for a given sampler dimensionality:
+ * samples s_tex (set 0, binding 0) at the interpolated VARYING_SLOT_VAR0
+ * coordinate and writes the texel to color output FRAG_RESULT_DATA0.
+ *
+ * Non-3D samplers are marked as arrayed so a single 3-component coordinate
+ * layout works for 1D-array, 2D-array, and 3D sources alike.  Caller owns
+ * the returned shader and must ralloc_free() it.
+ */
+static nir_shader *
+build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
+{
+   nir_builder b;
+
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   b.shader->info.name = ralloc_strdup(b.shader, "meta_blit_fs");
+
+   const struct glsl_type *color_type = glsl_vec4_type();
+
+   nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                                  glsl_vec4_type(), "v_attr");
+   tex_pos_in->data.location = VARYING_SLOT_VAR0;
+
+   const struct glsl_type *sampler_type =
+      glsl_sampler_type(tex_dim, false, false, glsl_get_base_type(color_type));
+   nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform,
+                                               sampler_type, "s_tex");
+   sampler->data.descriptor_set = 0;
+   sampler->data.binding = 0;
+
+   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
+   tex->sampler_dim = tex_dim;
+   tex->op = nir_texop_tex;
+   tex->src[0].src_type = nir_tex_src_coord;
+   tex->src[0].src = nir_src_for_ssa(nir_load_var(&b, tex_pos_in));
+   tex->dest_type = nir_type_float; /* TODO */
+
+   if (tex_dim != GLSL_SAMPLER_DIM_3D)
+      tex->is_array = true;
+
+   tex->coord_components = 3;
+
+   tex->sampler = nir_deref_var_create(tex, sampler);
+
+   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex");
+   nir_builder_instr_insert(&b, &tex->instr);
+
+   nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                 color_type, "f_color");
+   color_out->data.location = FRAG_RESULT_DATA0;
+   nir_store_var(&b, color_out, &tex->dest.ssa, 4);
+
+   return b.shader;
+}
+
+/* Snapshot the command-buffer state that meta operations clobber so that
+ * anv_meta_restore() can put it back: the bound pipeline, descriptor set 0,
+ * the meta vertex bindings, and whichever dynamic states are selected by
+ * dynamic_mask.
+ */
+void
+anv_meta_save(struct anv_meta_saved_state *state,
+              const struct anv_cmd_buffer *cmd_buffer,
+              uint32_t dynamic_mask)
+{
+   /* Remember which dynamic states were captured so restore is symmetric. */
+   state->dynamic_mask = dynamic_mask;
+   anv_dynamic_state_copy(&state->dynamic, &cmd_buffer->state.dynamic,
+                          dynamic_mask);
+
+   state->old_pipeline = cmd_buffer->state.pipeline;
+   state->old_descriptor_set0 = cmd_buffer->state.descriptors[0];
+   memcpy(state->old_vertex_bindings, cmd_buffer->state.vertex_bindings,
+          sizeof state->old_vertex_bindings);
+}
+
+/* Restore the command-buffer state captured by anv_meta_save() and flag
+ * everything that was touched as dirty so the next draw re-emits it.
+ */
+void
+anv_meta_restore(const struct anv_meta_saved_state *state,
+                 struct anv_cmd_buffer *cmd_buffer)
+{
+   cmd_buffer->state.pipeline = state->old_pipeline;
+   cmd_buffer->state.descriptors[0] = state->old_descriptor_set0;
+   memcpy(cmd_buffer->state.vertex_bindings, state->old_vertex_bindings,
+          sizeof(state->old_vertex_bindings));
+
+   /* Dirty every vertex binding meta may have overwritten, the pipeline,
+    * and the VS-stage descriptors.
+    */
+   cmd_buffer->state.vb_dirty |= (1 << ANV_META_VERTEX_BINDING_COUNT) - 1;
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_PIPELINE;
+   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_VERTEX_BIT;
+
+   /* Only the dynamic states that were saved are copied back and dirtied. */
+   anv_dynamic_state_copy(&cmd_buffer->state.dynamic, &state->dynamic,
+                          state->dynamic_mask);
+   cmd_buffer->state.dirty |= state->dynamic_mask;
+
+   /* Since we've used the pipeline with the VS disabled, set
+    * need_query_wa. See CmdBeginQuery.
+    */
+   cmd_buffer->state.need_query_wa = true;
+}
+
+/* Return the non-array VkImageViewType that corresponds 1:1 to the
+ * image's VkImageType.
+ */
+VkImageViewType
+anv_meta_get_view_type(const struct anv_image *image)
+{
+   switch (image->type) {
+   case VK_IMAGE_TYPE_1D:
+      return VK_IMAGE_VIEW_TYPE_1D;
+   case VK_IMAGE_TYPE_2D:
+      return VK_IMAGE_VIEW_TYPE_2D;
+   case VK_IMAGE_TYPE_3D:
+      return VK_IMAGE_VIEW_TYPE_3D;
+   default:
+      unreachable("bad VkImageViewType");
+   }
+}
+
+/* Compute the base array slice for the destination image view of a blit.
+ *
+ * For 1D/2D images this is simply the subresource's baseArrayLayer; for 3D
+ * images the destination z offset is reinterpreted as an array slice (see
+ * the HACK note below).
+ */
+static uint32_t
+meta_blit_get_dest_view_base_array_slice(const struct anv_image *dest_image,
+                                         const VkImageSubresourceLayers *dest_subresource,
+                                         const VkOffset3D *dest_offset)
+{
+   switch (dest_image->type) {
+   case VK_IMAGE_TYPE_1D:
+   case VK_IMAGE_TYPE_2D:
+      return dest_subresource->baseArrayLayer;
+   case VK_IMAGE_TYPE_3D:
+      /* HACK: Vulkan does not allow attaching a 3D image to a framebuffer,
+       * but meta does it anyway. When doing so, we translate the
+       * destination's z offset into an array offset.
+       */
+      return dest_offset->z;
+   default:
+      assert(!"bad VkImageType");
+      return 0;
+   }
+}
+
+/* Build all device-level state for the meta blit path: a one-attachment
+ * render pass, a descriptor-set layout and pipeline layout for a single
+ * combined image/sampler, and one graphics pipeline per source sampler
+ * dimensionality (1D/2D/3D).  On failure everything created so far is torn
+ * down via the goto-cleanup chain and the first error is returned; the
+ * temporary NIR shader modules are freed on both paths.
+ */
+static VkResult
+anv_device_init_meta_blit_state(struct anv_device *device)
+{
+   VkResult result;
+
+   result = anv_CreateRenderPass(anv_device_to_handle(device),
+      &(VkRenderPassCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+         .attachmentCount = 1,
+         .pAttachments = &(VkAttachmentDescription) {
+            .format = VK_FORMAT_UNDEFINED, /* Our shaders don't care */
+            .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+            .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+            .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+            .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
+         },
+         .subpassCount = 1,
+         .pSubpasses = &(VkSubpassDescription) {
+            .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+            .inputAttachmentCount = 0,
+            .colorAttachmentCount = 1,
+            .pColorAttachments = &(VkAttachmentReference) {
+               .attachment = 0,
+               .layout = VK_IMAGE_LAYOUT_GENERAL,
+            },
+            .pResolveAttachments = NULL,
+            .pDepthStencilAttachment = &(VkAttachmentReference) {
+               .attachment = VK_ATTACHMENT_UNUSED,
+               .layout = VK_IMAGE_LAYOUT_GENERAL,
+            },
+            .preserveAttachmentCount = 1,
+            .pPreserveAttachments = (uint32_t[]) { 0 },
+         },
+         .dependencyCount = 0,
+      }, &device->meta_state.alloc, &device->meta_state.blit.render_pass);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   /* The blit pipelines are created with the VS disabled (see disable_vs
+    * below): the VUEs are built on the CPU and handed straight to the
+    * rasterization backend.  We still provide a NIR vertex shader so that
+    * the compiler does not dead-code the pass-through inputs.
+    */
+   struct anv_shader_module vs = {
+      .nir = build_nir_vertex_shader(false),
+   };
+
+   struct anv_shader_module fs_1d = {
+      .nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_1D),
+   };
+
+   struct anv_shader_module fs_2d = {
+      .nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_2D),
+   };
+
+   struct anv_shader_module fs_3d = {
+      .nir = build_nir_copy_fragment_shader(GLSL_SAMPLER_DIM_3D),
+   };
+
+   /* Binding 0 carries the CPU-built VUE header, binding 1 the interleaved
+    * position + texcoord data (see meta_emit_blit's blit_vb_data).
+    */
+   VkPipelineVertexInputStateCreateInfo vi_create_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+      .vertexBindingDescriptionCount = 2,
+      .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
+         {
+            .binding = 0,
+            .stride = 0,
+            .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+         },
+         {
+            .binding = 1,
+            .stride = 5 * sizeof(float),
+            .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+         },
+      },
+      .vertexAttributeDescriptionCount = 3,
+      .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
+         {
+            /* VUE Header */
+            .location = 0,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32B32A32_UINT,
+            .offset = 0
+         },
+         {
+            /* Position */
+            .location = 1,
+            .binding = 1,
+            .format = VK_FORMAT_R32G32_SFLOAT,
+            .offset = 0
+         },
+         {
+            /* Texture Coordinate */
+            .location = 2,
+            .binding = 1,
+            .format = VK_FORMAT_R32G32B32_SFLOAT,
+            .offset = 8
+         }
+      }
+   };
+
+   VkDescriptorSetLayoutCreateInfo ds_layout_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+      .bindingCount = 1,
+      .pBindings = (VkDescriptorSetLayoutBinding[]) {
+         {
+            .binding = 0,
+            .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .descriptorCount = 1,
+            .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+            .pImmutableSamplers = NULL
+         },
+      }
+   };
+   result = anv_CreateDescriptorSetLayout(anv_device_to_handle(device),
+                                          &ds_layout_info,
+                                          &device->meta_state.alloc,
+                                          &device->meta_state.blit.ds_layout);
+   if (result != VK_SUCCESS)
+      goto fail_render_pass;
+
+   result = anv_CreatePipelineLayout(anv_device_to_handle(device),
+      &(VkPipelineLayoutCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+         .setLayoutCount = 1,
+         .pSetLayouts = &device->meta_state.blit.ds_layout,
+      },
+      &device->meta_state.alloc, &device->meta_state.blit.pipeline_layout);
+   if (result != VK_SUCCESS)
+      goto fail_descriptor_set_layout;
+
+   /* Stage 1's module is patched per-pipeline below before each create. */
+   VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
+      {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+         .stage = VK_SHADER_STAGE_VERTEX_BIT,
+         .module = anv_shader_module_to_handle(&vs),
+         .pName = "main",
+         .pSpecializationInfo = NULL
+      }, {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+         .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+         .module = VK_NULL_HANDLE, /* TEMPLATE VALUE! FILL ME IN! */
+         .pName = "main",
+         .pSpecializationInfo = NULL
+      },
+   };
+
+   const VkGraphicsPipelineCreateInfo vk_pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+      .stageCount = ARRAY_SIZE(pipeline_shader_stages),
+      .pStages = pipeline_shader_stages,
+      .pVertexInputState = &vi_create_info,
+      .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+         .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+         .primitiveRestartEnable = false,
+      },
+      .pViewportState = &(VkPipelineViewportStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+         .viewportCount = 1,
+         .scissorCount = 1,
+      },
+      .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+         .rasterizerDiscardEnable = false,
+         .polygonMode = VK_POLYGON_MODE_FILL,
+         .cullMode = VK_CULL_MODE_NONE,
+         .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE
+      },
+      .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+         .rasterizationSamples = 1,
+         .sampleShadingEnable = false,
+         .pSampleMask = (VkSampleMask[]) { UINT32_MAX },
+      },
+      .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+         .attachmentCount = 1,
+         .pAttachments = (VkPipelineColorBlendAttachmentState []) {
+            { .colorWriteMask =
+                 VK_COLOR_COMPONENT_A_BIT |
+                 VK_COLOR_COMPONENT_R_BIT |
+                 VK_COLOR_COMPONENT_G_BIT |
+                 VK_COLOR_COMPONENT_B_BIT },
+         }
+      },
+      /* Everything is declared dynamic so meta can set only what it needs. */
+      .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+         .dynamicStateCount = 9,
+         .pDynamicStates = (VkDynamicState[]) {
+            VK_DYNAMIC_STATE_VIEWPORT,
+            VK_DYNAMIC_STATE_SCISSOR,
+            VK_DYNAMIC_STATE_LINE_WIDTH,
+            VK_DYNAMIC_STATE_DEPTH_BIAS,
+            VK_DYNAMIC_STATE_BLEND_CONSTANTS,
+            VK_DYNAMIC_STATE_DEPTH_BOUNDS,
+            VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
+            VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
+            VK_DYNAMIC_STATE_STENCIL_REFERENCE,
+         },
+      },
+      .flags = 0,
+      .layout = device->meta_state.blit.pipeline_layout,
+      .renderPass = device->meta_state.blit.render_pass,
+      .subpass = 0,
+   };
+
+   const struct anv_graphics_pipeline_create_info anv_pipeline_info = {
+      .color_attachment_count = -1,
+      .use_repclear = false,
+      .disable_viewport = true,
+      .disable_scissor = true,
+      .disable_vs = true,
+      .use_rectlist = true
+   };
+
+   pipeline_shader_stages[1].module = anv_shader_module_to_handle(&fs_1d);
+   result = anv_graphics_pipeline_create(anv_device_to_handle(device),
+      VK_NULL_HANDLE,
+      &vk_pipeline_info, &anv_pipeline_info,
+      &device->meta_state.alloc, &device->meta_state.blit.pipeline_1d_src);
+   if (result != VK_SUCCESS)
+      goto fail_pipeline_layout;
+
+   pipeline_shader_stages[1].module = anv_shader_module_to_handle(&fs_2d);
+   result = anv_graphics_pipeline_create(anv_device_to_handle(device),
+      VK_NULL_HANDLE,
+      &vk_pipeline_info, &anv_pipeline_info,
+      &device->meta_state.alloc, &device->meta_state.blit.pipeline_2d_src);
+   if (result != VK_SUCCESS)
+      goto fail_pipeline_1d;
+
+   pipeline_shader_stages[1].module = anv_shader_module_to_handle(&fs_3d);
+   result = anv_graphics_pipeline_create(anv_device_to_handle(device),
+      VK_NULL_HANDLE,
+      &vk_pipeline_info, &anv_pipeline_info,
+      &device->meta_state.alloc, &device->meta_state.blit.pipeline_3d_src);
+   if (result != VK_SUCCESS)
+      goto fail_pipeline_2d;
+
+   /* The NIR is consumed during pipeline creation; free the temporaries. */
+   ralloc_free(vs.nir);
+   ralloc_free(fs_1d.nir);
+   ralloc_free(fs_2d.nir);
+   ralloc_free(fs_3d.nir);
+
+   return VK_SUCCESS;
+
+ fail_pipeline_2d:
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       device->meta_state.blit.pipeline_2d_src,
+                       &device->meta_state.alloc);
+
+ fail_pipeline_1d:
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       device->meta_state.blit.pipeline_1d_src,
+                       &device->meta_state.alloc);
+
+ fail_pipeline_layout:
+   anv_DestroyPipelineLayout(anv_device_to_handle(device),
+                             device->meta_state.blit.pipeline_layout,
+                             &device->meta_state.alloc);
+ fail_descriptor_set_layout:
+   anv_DestroyDescriptorSetLayout(anv_device_to_handle(device),
+                                  device->meta_state.blit.ds_layout,
+                                  &device->meta_state.alloc);
+ fail_render_pass:
+   anv_DestroyRenderPass(anv_device_to_handle(device),
+                         device->meta_state.blit.render_pass,
+                         &device->meta_state.alloc);
+
+   ralloc_free(vs.nir);
+   ralloc_free(fs_1d.nir);
+   ralloc_free(fs_2d.nir);
+   ralloc_free(fs_3d.nir);
+ fail:
+   return result;
+}
+
+/* Save the command-buffer state a blit will clobber; only the viewport is
+ * captured among the dynamic states, since that is all meta_emit_blit sets.
+ */
+static void
+meta_prepare_blit(struct anv_cmd_buffer *cmd_buffer,
+                  struct anv_meta_saved_state *saved_state)
+{
+   anv_meta_save(saved_state, cmd_buffer,
+                 (1 << VK_DYNAMIC_STATE_VIEWPORT));
+}
+
+/* Source and destination boxes of a single blit region. */
+struct blit_region {
+   VkOffset3D src_offset;
+   VkExtent3D src_extent;
+   VkOffset3D dest_offset;
+   VkExtent3D dest_extent;
+};
+
+/* Record one blit into the command buffer: build a three-vertex RECTLIST
+ * (VUE header in binding 0, interleaved position/texcoord in binding 1),
+ * bind a temporary sampler/descriptor-set/framebuffer, run a one-subpass
+ * render pass with the pipeline matching the source image dimensionality,
+ * and destroy the temporaries once the draw has consumed them.
+ *
+ * Caller must have run meta_prepare_blit() and follow with
+ * meta_finish_blit().
+ */
+static void
+meta_emit_blit(struct anv_cmd_buffer *cmd_buffer,
+               struct anv_image *src_image,
+               struct anv_image_view *src_iview,
+               VkOffset3D src_offset,
+               VkExtent3D src_extent,
+               struct anv_image *dest_image,
+               struct anv_image_view *dest_iview,
+               VkOffset3D dest_offset,
+               VkExtent3D dest_extent,
+               VkFilter blit_filter)
+{
+   struct anv_device *device = cmd_buffer->device;
+   /* Non-zero placeholder handle; presumably descriptor allocation here
+    * does not consult a real pool -- confirm against AllocateDescriptorSets.
+    */
+   VkDescriptorPool dummy_desc_pool = (VkDescriptorPool)1;
+
+   struct blit_vb_data {
+      float pos[2];
+      float tex_coord[3];
+   } *vb_data;
+
+   unsigned vb_size = sizeof(struct anv_vue_header) + 3 * sizeof(*vb_data);
+
+   struct anv_state vb_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, vb_size, 16);
+   memset(vb_state.map, 0, sizeof(struct anv_vue_header));
+   vb_data = vb_state.map + sizeof(struct anv_vue_header);
+
+   /* Three corners of the destination rectangle with normalized source
+    * texture coordinates; the RECTLIST hardware infers the fourth corner.
+    */
+   vb_data[0] = (struct blit_vb_data) {
+      .pos = {
+         dest_offset.x + dest_extent.width,
+         dest_offset.y + dest_extent.height,
+      },
+      .tex_coord = {
+         (float)(src_offset.x + src_extent.width) / (float)src_iview->extent.width,
+         (float)(src_offset.y + src_extent.height) / (float)src_iview->extent.height,
+         (float)src_offset.z / (float)src_iview->extent.depth,
+      },
+   };
+
+   vb_data[1] = (struct blit_vb_data) {
+      .pos = {
+         dest_offset.x,
+         dest_offset.y + dest_extent.height,
+      },
+      .tex_coord = {
+         (float)src_offset.x / (float)src_iview->extent.width,
+         (float)(src_offset.y + src_extent.height) / (float)src_iview->extent.height,
+         (float)src_offset.z / (float)src_iview->extent.depth,
+      },
+   };
+
+   vb_data[2] = (struct blit_vb_data) {
+      .pos = {
+         dest_offset.x,
+         dest_offset.y,
+      },
+      .tex_coord = {
+         (float)src_offset.x / (float)src_iview->extent.width,
+         (float)src_offset.y / (float)src_iview->extent.height,
+         (float)src_offset.z / (float)src_iview->extent.depth,
+      },
+   };
+
+   anv_state_clflush(vb_state);
+
+   struct anv_buffer vertex_buffer = {
+      .device = device,
+      .size = vb_size,
+      .bo = &device->dynamic_state_block_pool.bo,
+      .offset = vb_state.offset,
+   };
+
+   anv_CmdBindVertexBuffers(anv_cmd_buffer_to_handle(cmd_buffer), 0, 2,
+      (VkBuffer[]) {
+         anv_buffer_to_handle(&vertex_buffer),
+         anv_buffer_to_handle(&vertex_buffer)
+      },
+      (VkDeviceSize[]) {
+         0,
+         sizeof(struct anv_vue_header),
+      });
+
+   VkSampler sampler;
+   ANV_CALL(CreateSampler)(anv_device_to_handle(device),
+      &(VkSamplerCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+         .magFilter = blit_filter,
+         .minFilter = blit_filter,
+      }, &cmd_buffer->pool->alloc, &sampler);
+
+   VkDescriptorSet set;
+   anv_AllocateDescriptorSets(anv_device_to_handle(device),
+      &(VkDescriptorSetAllocateInfo) {
+         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+         .descriptorPool = dummy_desc_pool,
+         .descriptorSetCount = 1,
+         .pSetLayouts = &device->meta_state.blit.ds_layout
+      }, &set);
+   anv_UpdateDescriptorSets(anv_device_to_handle(device),
+      1, /* writeCount */
+      (VkWriteDescriptorSet[]) {
+         {
+            .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+            .dstSet = set,
+            .dstBinding = 0,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .pImageInfo = (VkDescriptorImageInfo[]) {
+               {
+                  .sampler = sampler,
+                  .imageView = anv_image_view_to_handle(src_iview),
+                  .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+               },
+            }
+         }
+      }, 0, NULL);
+
+   VkFramebuffer fb;
+   anv_CreateFramebuffer(anv_device_to_handle(device),
+      &(VkFramebufferCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+         .attachmentCount = 1,
+         .pAttachments = (VkImageView[]) {
+            anv_image_view_to_handle(dest_iview),
+         },
+         .width = dest_iview->extent.width,
+         .height = dest_iview->extent.height,
+         .layers = 1
+      }, &cmd_buffer->pool->alloc, &fb);
+
+   ANV_CALL(CmdBeginRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer),
+      &(VkRenderPassBeginInfo) {
+         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+         .renderPass = device->meta_state.blit.render_pass,
+         .framebuffer = fb,
+         .renderArea = {
+            .offset = { dest_offset.x, dest_offset.y },
+            .extent = { dest_extent.width, dest_extent.height },
+         },
+         .clearValueCount = 0,
+         .pClearValues = NULL,
+      }, VK_SUBPASS_CONTENTS_INLINE);
+
+   VkPipeline pipeline;
+
+   switch (src_image->type) {
+   case VK_IMAGE_TYPE_1D:
+      pipeline = device->meta_state.blit.pipeline_1d_src;
+      break;
+   case VK_IMAGE_TYPE_2D:
+      pipeline = device->meta_state.blit.pipeline_2d_src;
+      break;
+   case VK_IMAGE_TYPE_3D:
+      pipeline = device->meta_state.blit.pipeline_3d_src;
+      break;
+   default:
+      /* unreachable() already negates its argument internally; passing
+       * !"..." made the debug assert a no-op.  Pass the string directly,
+       * matching anv_meta_get_view_type().
+       */
+      unreachable("bad VkImageType");
+   }
+
+   if (cmd_buffer->state.pipeline != anv_pipeline_from_handle(pipeline)) {
+      anv_CmdBindPipeline(anv_cmd_buffer_to_handle(cmd_buffer),
+                          VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+   }
+
+   anv_CmdSetViewport(anv_cmd_buffer_to_handle(cmd_buffer), 0, 1,
+                      &(VkViewport) {
+                        .x = 0.0f,
+                        .y = 0.0f,
+                        .width = dest_iview->extent.width,
+                        .height = dest_iview->extent.height,
+                        .minDepth = 0.0f,
+                        .maxDepth = 1.0f,
+                      });
+
+   anv_CmdBindDescriptorSets(anv_cmd_buffer_to_handle(cmd_buffer),
+                             VK_PIPELINE_BIND_POINT_GRAPHICS,
+                             device->meta_state.blit.pipeline_layout, 0, 1,
+                             &set, 0, NULL);
+
+   ANV_CALL(CmdDraw)(anv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
+
+   ANV_CALL(CmdEndRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer));
+
+   /* At the point where we emit the draw call, all data from the
+    * descriptor sets, etc. has been used.  We are free to delete it.
+    */
+   anv_descriptor_set_destroy(device, anv_descriptor_set_from_handle(set));
+   anv_DestroySampler(anv_device_to_handle(device), sampler,
+                      &cmd_buffer->pool->alloc);
+   anv_DestroyFramebuffer(anv_device_to_handle(device), fb,
+                          &cmd_buffer->pool->alloc);
+}
+
+/* Tear-down counterpart of meta_prepare_blit(): restores the pipeline,
+ * descriptor and dynamic state that was saved before the meta blit began.
+ */
+static void
+meta_finish_blit(struct anv_cmd_buffer *cmd_buffer,
+                 const struct anv_meta_saved_state *saved_state)
+{
+   anv_meta_restore(saved_state, cmd_buffer);
+}
+
+/* Map a texel block size in bytes to an uncompressed UINT format of exactly
+ * that size.  Copies done through these formats behave like memcpy,
+ * independent of the image's real format.
+ */
+static VkFormat
+vk_format_for_size(int bs)
+{
+   /* Note: We intentionally use the 4-channel formats whenever we can.
+    * This is so that, when we do a RGB <-> RGBX copy, the two formats will
+    * line up even though one of them is 3/4 the size of the other.
+    */
+   switch (bs) {
+   case 1: return VK_FORMAT_R8_UINT;
+   case 2: return VK_FORMAT_R8G8_UINT;
+   case 3: return VK_FORMAT_R8G8B8_UINT;
+   case 4: return VK_FORMAT_R8G8B8A8_UINT;
+   case 6: return VK_FORMAT_R16G16B16_UINT;
+   case 8: return VK_FORMAT_R16G16B16A16_UINT;
+   case 12: return VK_FORMAT_R32G32B32_UINT;
+   case 16: return VK_FORMAT_R32G32B32A32_UINT;
+   default:
+      unreachable("Invalid format block size");
+   }
+}
+
+/* Copy a width x height rectangle of copy_format texels from one BO to
+ * another by wrapping each BO in a temporary linear 2D image and running
+ * the meta blit pipeline between them.  Offsets must be aligned to the
+ * block size of copy_format.
+ */
+static void
+do_buffer_copy(struct anv_cmd_buffer *cmd_buffer,
+               struct anv_bo *src, uint64_t src_offset,
+               struct anv_bo *dest, uint64_t dest_offset,
+               int width, int height, VkFormat copy_format)
+{
+   VkDevice vk_device = anv_device_to_handle(cmd_buffer->device);
+
+   /* Shared template for both temporary images; only .usage differs. */
+   VkImageCreateInfo image_info = {
+      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+      .imageType = VK_IMAGE_TYPE_2D,
+      .format = copy_format,
+      .extent = {
+         .width = width,
+         .height = height,
+         .depth = 1,
+      },
+      .mipLevels = 1,
+      .arrayLayers = 1,
+      .samples = 1,
+      .tiling = VK_IMAGE_TILING_LINEAR,
+      .usage = 0,
+      .flags = 0,
+   };
+
+   /* NOTE(review): the VkResult of both anv_CreateImage calls is ignored;
+    * presumably meta-internal image creation cannot fail here -- confirm.
+    */
+   VkImage src_image;
+   image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
+   anv_CreateImage(vk_device, &image_info,
+                   &cmd_buffer->pool->alloc, &src_image);
+
+   VkImage dest_image;
+   image_info.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+   anv_CreateImage(vk_device, &image_info,
+                   &cmd_buffer->pool->alloc, &dest_image);
+
+   /* We could use a vk call to bind memory, but that would require
+    * creating a dummy memory object etc. so there's really no point.
+    */
+   anv_image_from_handle(src_image)->bo = src;
+   anv_image_from_handle(src_image)->offset = src_offset;
+   anv_image_from_handle(dest_image)->bo = dest;
+   anv_image_from_handle(dest_image)->offset = dest_offset;
+
+   struct anv_image_view src_iview;
+   anv_image_view_init(&src_iview, cmd_buffer->device,
+      &(VkImageViewCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+         .image = src_image,
+         .viewType = VK_IMAGE_VIEW_TYPE_2D,
+         .format = copy_format,
+         .subresourceRange = {
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1
+         },
+      },
+      cmd_buffer);
+
+   struct anv_image_view dest_iview;
+   anv_image_view_init(&dest_iview, cmd_buffer->device,
+      &(VkImageViewCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+         .image = dest_image,
+         .viewType = VK_IMAGE_VIEW_TYPE_2D,
+         .format = copy_format,
+         .subresourceRange = {
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+         },
+      },
+      cmd_buffer);
+
+   /* 1:1 blit (same src and dest extents), so NEAREST is exact. */
+   meta_emit_blit(cmd_buffer,
+                  anv_image_from_handle(src_image),
+                  &src_iview,
+                  (VkOffset3D) { 0, 0, 0 },
+                  (VkExtent3D) { width, height, 1 },
+                  anv_image_from_handle(dest_image),
+                  &dest_iview,
+                  (VkOffset3D) { 0, 0, 0 },
+                  (VkExtent3D) { width, height, 1 },
+                  VK_FILTER_NEAREST);
+
+   anv_DestroyImage(vk_device, src_image, &cmd_buffer->pool->alloc);
+   anv_DestroyImage(vk_device, dest_image, &cmd_buffer->pool->alloc);
+}
+
+/* vkCmdCopyBuffer: implemented as a sequence of 2D image blits.  For each
+ * region we pick the widest UINT format compatible with the alignment of
+ * both offsets and the size, then carve the copy into max-sized rectangles,
+ * one partial-height rectangle, and a final single-row remainder.
+ */
+void anv_CmdCopyBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    srcBuffer,
+    VkBuffer                                    destBuffer,
+    uint32_t                                    regionCount,
+    const VkBufferCopy*                         pRegions)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, src_buffer, srcBuffer);
+   ANV_FROM_HANDLE(anv_buffer, dest_buffer, destBuffer);
+
+   struct anv_meta_saved_state saved_state;
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   for (unsigned r = 0; r < regionCount; r++) {
+      uint64_t src_offset = src_buffer->offset + pRegions[r].srcOffset;
+      uint64_t dest_offset = dest_buffer->offset + pRegions[r].dstOffset;
+      uint64_t copy_size = pRegions[r].size;
+
+      /* First, we compute the biggest format that can be used with the
+       * given offsets and size.
+       */
+      int bs = 16;
+
+      /* NOTE(review): ffs() takes an int, so these 64-bit values are
+       * implicitly truncated.  This appears benign because bs never
+       * exceeds 16, so only the low 4 bits of each offset matter -- but
+       * ffsll() would make the intent explicit.
+       */
+      int fs = ffs(src_offset) - 1;
+      if (fs != -1)
+         bs = MIN2(bs, 1 << fs);
+      assert(src_offset % bs == 0);
+
+      fs = ffs(dest_offset) - 1;
+      if (fs != -1)
+         bs = MIN2(bs, 1 << fs);
+      assert(dest_offset % bs == 0);
+
+      fs = ffs(pRegions[r].size) - 1;
+      if (fs != -1)
+         bs = MIN2(bs, 1 << fs);
+      assert(pRegions[r].size % bs == 0);
+
+      VkFormat copy_format = vk_format_for_size(bs);
+
+      /* This is maximum possible width/height our HW can handle */
+      uint64_t max_surface_dim = 1 << 14;
+
+      /* First, we make a bunch of max-sized copies */
+      uint64_t max_copy_size = max_surface_dim * max_surface_dim * bs;
+      while (copy_size > max_copy_size) {
+         do_buffer_copy(cmd_buffer, src_buffer->bo, src_offset,
+                        dest_buffer->bo, dest_offset,
+                        max_surface_dim, max_surface_dim, copy_format);
+         copy_size -= max_copy_size;
+         src_offset += max_copy_size;
+         dest_offset += max_copy_size;
+      }
+
+      /* Next, a full-width rectangle covering as many whole rows as remain. */
+      uint64_t height = copy_size / (max_surface_dim * bs);
+      assert(height < max_surface_dim);
+      if (height != 0) {
+         uint64_t rect_copy_size = height * max_surface_dim * bs;
+         do_buffer_copy(cmd_buffer, src_buffer->bo, src_offset,
+                        dest_buffer->bo, dest_offset,
+                        max_surface_dim, height, copy_format);
+         copy_size -= rect_copy_size;
+         src_offset += rect_copy_size;
+         dest_offset += rect_copy_size;
+      }
+
+      /* Finally, whatever is left fits in a single row. */
+      if (copy_size != 0) {
+         do_buffer_copy(cmd_buffer, src_buffer->bo, src_offset,
+                        dest_buffer->bo, dest_offset,
+                        copy_size / bs, 1, copy_format);
+      }
+   }
+
+   meta_finish_blit(cmd_buffer, &saved_state);
+}
+
+/* vkCmdUpdateBuffer: stages the user data in the command buffer's dynamic
+ * state stream, then copies it into the destination buffer with
+ * do_buffer_copy(), one block-pool-sized chunk at a time.  dstOffset and
+ * dataSize are required by the API to be 4-byte aligned, which the format
+ * selection below relies on.
+ */
+void anv_CmdUpdateBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    dstBuffer,
+    VkDeviceSize                                dstOffset,
+    VkDeviceSize                                dataSize,
+    const uint32_t*                             pData)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
+   struct anv_meta_saved_state saved_state;
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   /* We can't quite grab a full block because the state stream needs a
+    * little data at the top to build its linked list.
+    */
+   const uint32_t max_update_size =
+      cmd_buffer->device->dynamic_state_block_pool.block_size - 64;
+
+   /* Each chunk must fit in one row of a max-width (2^14) surface. */
+   assert(max_update_size < (1 << 14) * 4);
+
+   while (dataSize) {
+      const uint32_t copy_size = MIN2(dataSize, max_update_size);
+
+      struct anv_state tmp_data =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, copy_size, 64);
+
+      memcpy(tmp_data.map, pData, copy_size);
+
+      /* Pick the widest UINT format both the size and offset are aligned to. */
+      VkFormat format;
+      int bs;
+      if ((copy_size & 15) == 0 && (dstOffset & 15) == 0) {
+         format = VK_FORMAT_R32G32B32A32_UINT;
+         bs = 16;
+      } else if ((copy_size & 7) == 0 && (dstOffset & 7) == 0) {
+         format = VK_FORMAT_R32G32_UINT;
+         bs = 8;
+      } else {
+         assert((copy_size & 3) == 0 && (dstOffset & 3) == 0);
+         format = VK_FORMAT_R32_UINT;
+         bs = 4;
+      }
+
+      do_buffer_copy(cmd_buffer,
+                     &cmd_buffer->device->dynamic_state_block_pool.bo,
+                     tmp_data.offset,
+                     dst_buffer->bo, dst_buffer->offset + dstOffset,
+                     copy_size / bs, 1, format);
+
+      dataSize -= copy_size;
+      dstOffset += copy_size;
+      /* void-pointer arithmetic is a GNU C extension (byte-wise advance). */
+      pData = (void *)pData + copy_size;
+   }
+}
+
+/* Pick the UINT format matching the byte size of the image's surface format
+ * for the given (single) aspect, so image<->image copies act like memcpy.
+ */
+static VkFormat
+choose_iview_format(struct anv_image *image, VkImageAspectFlagBits aspect)
+{
+   /* Exactly one aspect bit may be set. */
+   assert(__builtin_popcount(aspect) == 1);
+
+   struct isl_surf *surf =
+      &anv_image_get_surface_for_aspect_mask(image, aspect)->isl;
+
+   /* vkCmdCopyImage behaves like memcpy. Therefore we choose identical UINT
+    * formats for the source and destination image views.
+    *
+    * From the Vulkan spec (2015-12-30):
+    *
+    *    vkCmdCopyImage performs image copies in a similar manner to a host
+    *    memcpy. It does not perform general-purpose conversions such as
+    *    scaling, resizing, blending, color-space conversion, or format
+    *    conversions.  Rather, it simply copies raw image data. vkCmdCopyImage
+    *    can copy between images with different formats, provided the formats
+    *    are compatible as defined below.
+    *
+    *    [The spec later defines compatibility as having the same number of
+    *    bytes per block].
+    */
+   return vk_format_for_size(isl_format_layouts[surf->format].bs);
+}
+
+/* Pick the UINT format for the buffer side of a buffer<->image copy, based
+ * on the linear layout of the image's format for the given (single) aspect.
+ */
+static VkFormat
+choose_buffer_format(struct anv_image *image, VkImageAspectFlagBits aspect)
+{
+   /* Exactly one aspect bit may be set. */
+   assert(__builtin_popcount(aspect) == 1);
+
+   /* vkCmdCopy* commands behave like memcpy. Therefore we choose
+    * compatible UINT formats for the source and destination image views.
+    *
+    * For the buffer, we go back to the original image format and get
+    * the format as if it were linear.  This way, for RGB formats, we get
+    * an RGB format here even if the tiled image is RGBA. XXX: This doesn't
+    * work if the buffer is the destination.
+    */
+   enum isl_format linear_format = anv_get_isl_format(image->vk_format, aspect,
+                                                      VK_IMAGE_TILING_LINEAR);
+
+   return vk_format_for_size(isl_format_layouts[linear_format].bs);
+}
+
+/* vkCmdCopyImage: raw (memcpy-like) image-to-image copy via the meta blit
+ * path.  Both subresources are viewed through size-matched UINT formats;
+ * 3D depth slices and array layers are blitted one at a time.
+ */
+void anv_CmdCopyImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     destImage,
+    VkImageLayout                               destImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageCopy*                          pRegions)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, src_image, srcImage);
+   ANV_FROM_HANDLE(anv_image, dest_image, destImage);
+
+   struct anv_meta_saved_state saved_state;
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   for (unsigned r = 0; r < regionCount; r++) {
+      assert(pRegions[r].srcSubresource.aspectMask ==
+             pRegions[r].dstSubresource.aspectMask);
+
+      VkImageAspectFlags aspect = pRegions[r].srcSubresource.aspectMask;
+
+      VkFormat src_format = choose_iview_format(src_image, aspect);
+      VkFormat dst_format = choose_iview_format(dest_image, aspect);
+
+      struct anv_image_view src_iview;
+      anv_image_view_init(&src_iview, cmd_buffer->device,
+         &(VkImageViewCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = srcImage,
+            .viewType = anv_meta_get_view_type(src_image),
+            .format = src_format,
+            .subresourceRange = {
+               .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+               .baseMipLevel = pRegions[r].srcSubresource.mipLevel,
+               .levelCount = 1,
+               .baseArrayLayer = pRegions[r].srcSubresource.baseArrayLayer,
+               /* NOTE(review): this reads dstSubresource.layerCount for the
+                * *source* view; the non-3D path asserts the two layerCounts
+                * are equal below, but srcSubresource.layerCount would be the
+                * consistent choice -- confirm intent.
+                */
+               .layerCount = pRegions[r].dstSubresource.layerCount,
+            },
+         },
+         cmd_buffer);
+
+      const VkOffset3D dest_offset = {
+         .x = pRegions[r].dstOffset.x,
+         .y = pRegions[r].dstOffset.y,
+         .z = 0,
+      };
+
+      /* For 3D images we iterate over depth slices; otherwise over layers. */
+      unsigned num_slices;
+      if (src_image->type == VK_IMAGE_TYPE_3D) {
+         assert(pRegions[r].srcSubresource.layerCount == 1 &&
+                pRegions[r].dstSubresource.layerCount == 1);
+         num_slices = pRegions[r].extent.depth;
+      } else {
+         assert(pRegions[r].srcSubresource.layerCount ==
+                pRegions[r].dstSubresource.layerCount);
+         assert(pRegions[r].extent.depth == 1);
+         num_slices = pRegions[r].dstSubresource.layerCount;
+      }
+
+      const uint32_t dest_base_array_slice =
+         meta_blit_get_dest_view_base_array_slice(dest_image,
+                                                  &pRegions[r].dstSubresource,
+                                                  &pRegions[r].dstOffset);
+
+      for (unsigned slice = 0; slice < num_slices; slice++) {
+         VkOffset3D src_offset = pRegions[r].srcOffset;
+         src_offset.z += slice;
+
+         struct anv_image_view dest_iview;
+         anv_image_view_init(&dest_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo) {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = destImage,
+               .viewType = anv_meta_get_view_type(dest_image),
+               .format = dst_format,
+               .subresourceRange = {
+                  .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                  .baseMipLevel = pRegions[r].dstSubresource.mipLevel,
+                  .levelCount = 1,
+                  .baseArrayLayer = dest_base_array_slice + slice,
+                  .layerCount = 1
+               },
+            },
+            cmd_buffer);
+
+         /* Same src and dest extent, so NEAREST gives an exact copy. */
+         meta_emit_blit(cmd_buffer,
+                        src_image, &src_iview,
+                        src_offset,
+                        pRegions[r].extent,
+                        dest_image, &dest_iview,
+                        dest_offset,
+                        pRegions[r].extent,
+                        VK_FILTER_NEAREST);
+      }
+   }
+
+   meta_finish_blit(cmd_buffer, &saved_state);
+}
+
+/* vkCmdBlitImage: scaled/filtered image copy via the meta blit path.
+ * Still incomplete: the requested filter, flipped blits, multiple array
+ * layers and multiple depth slices are all flagged with anv_finishme().
+ */
+void anv_CmdBlitImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     destImage,
+    VkImageLayout                               destImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageBlit*                          pRegions,
+    VkFilter                                    filter)
+
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, src_image, srcImage);
+   ANV_FROM_HANDLE(anv_image, dest_image, destImage);
+
+   struct anv_meta_saved_state saved_state;
+
+   anv_finishme("respect VkFilter");
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   for (unsigned r = 0; r < regionCount; r++) {
+      struct anv_image_view src_iview;
+      anv_image_view_init(&src_iview, cmd_buffer->device,
+         &(VkImageViewCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = srcImage,
+            .viewType = anv_meta_get_view_type(src_image),
+            .format = src_image->vk_format,
+            .subresourceRange = {
+               .aspectMask = pRegions[r].srcSubresource.aspectMask,
+               .baseMipLevel = pRegions[r].srcSubresource.mipLevel,
+               .levelCount = 1,
+               .baseArrayLayer = pRegions[r].srcSubresource.baseArrayLayer,
+               .layerCount = 1
+            },
+         },
+         cmd_buffer);
+
+      const VkOffset3D dest_offset = {
+         .x = pRegions[r].dstOffsets[0].x,
+         .y = pRegions[r].dstOffsets[0].y,
+         .z = 0,
+      };
+
+      if (pRegions[r].dstOffsets[1].x < pRegions[r].dstOffsets[0].x ||
+          pRegions[r].dstOffsets[1].y < pRegions[r].dstOffsets[0].y ||
+          pRegions[r].srcOffsets[1].x < pRegions[r].srcOffsets[0].x ||
+          pRegions[r].srcOffsets[1].y < pRegions[r].srcOffsets[0].y)
+         anv_finishme("FINISHME: Allow flipping in blits");
+
+      /* .depth is intentionally left zero-initialized; presumably only
+       * width/height are consumed downstream -- confirm in meta_emit_blit.
+       */
+      const VkExtent3D dest_extent = {
+         .width = pRegions[r].dstOffsets[1].x - pRegions[r].dstOffsets[0].x,
+         .height = pRegions[r].dstOffsets[1].y - pRegions[r].dstOffsets[0].y,
+      };
+
+      const VkExtent3D src_extent = {
+         .width = pRegions[r].srcOffsets[1].x - pRegions[r].srcOffsets[0].x,
+         .height = pRegions[r].srcOffsets[1].y - pRegions[r].srcOffsets[0].y,
+      };
+
+      const uint32_t dest_array_slice =
+         meta_blit_get_dest_view_base_array_slice(dest_image,
+                                                  &pRegions[r].dstSubresource,
+                                                  &pRegions[r].dstOffsets[0]);
+
+      if (pRegions[r].srcSubresource.layerCount > 1)
+         anv_finishme("FINISHME: copy multiple array layers");
+
+      if (pRegions[r].srcOffsets[0].z + 1 != pRegions[r].srcOffsets[1].z ||
+          pRegions[r].dstOffsets[0].z + 1 != pRegions[r].dstOffsets[1].z)
+         anv_finishme("FINISHME: copy multiple depth layers");
+
+      struct anv_image_view dest_iview;
+      anv_image_view_init(&dest_iview, cmd_buffer->device,
+         &(VkImageViewCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = destImage,
+            .viewType = anv_meta_get_view_type(dest_image),
+            .format = dest_image->vk_format,
+            .subresourceRange = {
+               .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+               .baseMipLevel = pRegions[r].dstSubresource.mipLevel,
+               .levelCount = 1,
+               .baseArrayLayer = dest_array_slice,
+               .layerCount = 1
+            },
+         },
+         cmd_buffer);
+
+      meta_emit_blit(cmd_buffer,
+                     src_image, &src_iview,
+                     pRegions[r].srcOffsets[0], src_extent,
+                     dest_image, &dest_iview,
+                     dest_offset, dest_extent,
+                     filter);
+   }
+
+   meta_finish_blit(cmd_buffer, &saved_state);
+}
+
+/* Wrap a VkBuffer in a temporary linear 2D image suitable for use as one
+ * side of a buffer<->image copy blit.  bufferRowLength/bufferImageHeight,
+ * when non-zero, override the image extent to express the buffer's pitch.
+ * Caller destroys the returned image with anv_DestroyImage().
+ *
+ * NOTE(review): the image_type parameter is currently unused -- the image
+ * is always created as VK_IMAGE_TYPE_2D.
+ */
+static struct anv_image *
+make_image_for_buffer(VkDevice vk_device, VkBuffer vk_buffer, VkFormat format,
+                      VkImageUsageFlags usage,
+                      VkImageType image_type,
+                      const VkAllocationCallbacks *alloc,
+                      const VkBufferImageCopy *copy)
+{
+   ANV_FROM_HANDLE(anv_buffer, buffer, vk_buffer);
+
+   VkExtent3D extent = copy->imageExtent;
+   if (copy->bufferRowLength)
+      extent.width = copy->bufferRowLength;
+   if (copy->bufferImageHeight)
+      extent.height = copy->bufferImageHeight;
+   extent.depth = 1;
+
+   VkImage vk_image;
+   VkResult result = anv_CreateImage(vk_device,
+      &(VkImageCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+         .imageType = VK_IMAGE_TYPE_2D,
+         .format = format,
+         .extent = extent,
+         .mipLevels = 1,
+         .arrayLayers = 1,
+         .samples = 1,
+         .tiling = VK_IMAGE_TILING_LINEAR,
+         .usage = usage,
+         .flags = 0,
+      }, alloc, &vk_image);
+   /* result is only consumed by this assert (unused in NDEBUG builds). */
+   assert(result == VK_SUCCESS);
+
+   ANV_FROM_HANDLE(anv_image, image, vk_image);
+
+   /* We could use a vk call to bind memory, but that would require
+    * creating a dummy memory object etc. so there's really no point.
+    */
+   image->bo = buffer->bo;
+   image->offset = buffer->offset + copy->bufferOffset;
+
+   return image;
+}
+
+/* vkCmdCopyBufferToImage: wraps the source buffer in a temporary linear
+ * image (one per region) and blits it into the destination image, one
+ * depth slice / array layer at a time.  Between slices the wrapper image's
+ * offset is advanced instead of recreating it.
+ */
+void anv_CmdCopyBufferToImage(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    srcBuffer,
+    VkImage                                     destImage,
+    VkImageLayout                               destImageLayout,
+    uint32_t                                    regionCount,
+    const VkBufferImageCopy*                    pRegions)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, dest_image, destImage);
+   VkDevice vk_device = anv_device_to_handle(cmd_buffer->device);
+   struct anv_meta_saved_state saved_state;
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   for (unsigned r = 0; r < regionCount; r++) {
+      VkImageAspectFlags aspect = pRegions[r].imageSubresource.aspectMask;
+
+      /* Same byte size, possibly different channel count (RGB vs RGBX). */
+      VkFormat image_format = choose_iview_format(dest_image, aspect);
+      VkFormat buffer_format = choose_buffer_format(dest_image, aspect);
+
+      struct anv_image *src_image =
+         make_image_for_buffer(vk_device, srcBuffer, buffer_format,
+                               VK_IMAGE_USAGE_SAMPLED_BIT,
+                               dest_image->type, &cmd_buffer->pool->alloc,
+                               &pRegions[r]);
+
+      const uint32_t dest_base_array_slice =
+         meta_blit_get_dest_view_base_array_slice(dest_image,
+                                                  &pRegions[r].imageSubresource,
+                                                  &pRegions[r].imageOffset);
+
+      /* For 3D images we iterate over depth slices; otherwise over layers. */
+      unsigned num_slices;
+      if (dest_image->type == VK_IMAGE_TYPE_3D) {
+         assert(pRegions[r].imageSubresource.layerCount == 1);
+         num_slices = pRegions[r].imageExtent.depth;
+      } else {
+         assert(pRegions[r].imageExtent.depth == 1);
+         num_slices = pRegions[r].imageSubresource.layerCount;
+      }
+
+      for (unsigned slice = 0; slice < num_slices; slice++) {
+         struct anv_image_view src_iview;
+         anv_image_view_init(&src_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo) {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = anv_image_to_handle(src_image),
+               .viewType = VK_IMAGE_VIEW_TYPE_2D,
+               .format = buffer_format,
+               .subresourceRange = {
+                  .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                  .baseMipLevel = 0,
+                  .levelCount = 1,
+                  .baseArrayLayer = 0,
+                  .layerCount = 1,
+               },
+            },
+            cmd_buffer);
+
+         struct anv_image_view dest_iview;
+         anv_image_view_init(&dest_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo) {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = anv_image_to_handle(dest_image),
+               .viewType = anv_meta_get_view_type(dest_image),
+               .format = image_format,
+               .subresourceRange = {
+                  .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                  .baseMipLevel = pRegions[r].imageSubresource.mipLevel,
+                  .levelCount = 1,
+                  .baseArrayLayer = dest_base_array_slice + slice,
+                  .layerCount = 1
+               },
+            },
+            cmd_buffer);
+
+         VkOffset3D src_offset = { 0, 0, slice };
+
+         const VkOffset3D dest_offset = {
+            .x = pRegions[r].imageOffset.x,
+            .y = pRegions[r].imageOffset.y,
+            .z = 0,
+         };
+
+         meta_emit_blit(cmd_buffer,
+                        src_image,
+                        &src_iview,
+                        src_offset,
+                        pRegions[r].imageExtent,
+                        dest_image,
+                        &dest_iview,
+                        dest_offset,
+                        pRegions[r].imageExtent,
+                        VK_FILTER_NEAREST);
+
+         /* Once we've done the blit, all of the actual information about
+          * the image is embedded in the command buffer so we can just
+          * increment the offset directly in the image effectively
+          * re-binding it to different backing memory.
+          */
+         src_image->offset += src_image->extent.width *
+                              src_image->extent.height *
+                              src_image->format->isl_layout->bs;
+      }
+
+      anv_DestroyImage(vk_device, anv_image_to_handle(src_image),
+                       &cmd_buffer->pool->alloc);
+   }
+
+   meta_finish_blit(cmd_buffer, &saved_state);
+}
+
+/* vkCmdCopyImageToBuffer: mirror of anv_CmdCopyBufferToImage -- wraps the
+ * destination buffer in a temporary linear image and blits each depth
+ * slice / array layer of the source image into it.
+ */
+void anv_CmdCopyImageToBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkBuffer                                    destBuffer,
+    uint32_t                                    regionCount,
+    const VkBufferImageCopy*                    pRegions)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, src_image, srcImage);
+   VkDevice vk_device = anv_device_to_handle(cmd_buffer->device);
+   struct anv_meta_saved_state saved_state;
+
+   meta_prepare_blit(cmd_buffer, &saved_state);
+
+   for (unsigned r = 0; r < regionCount; r++) {
+      VkImageAspectFlags aspect = pRegions[r].imageSubresource.aspectMask;
+
+      VkFormat image_format = choose_iview_format(src_image, aspect);
+      VkFormat buffer_format = choose_buffer_format(src_image, aspect);
+
+      struct anv_image_view src_iview;
+      anv_image_view_init(&src_iview, cmd_buffer->device,
+         &(VkImageViewCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = srcImage,
+            .viewType = anv_meta_get_view_type(src_image),
+            .format = image_format,
+            .subresourceRange = {
+               .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+               .baseMipLevel = pRegions[r].imageSubresource.mipLevel,
+               .levelCount = 1,
+               .baseArrayLayer = pRegions[r].imageSubresource.baseArrayLayer,
+               .layerCount = pRegions[r].imageSubresource.layerCount,
+            },
+         },
+         cmd_buffer);
+
+      struct anv_image *dest_image =
+         make_image_for_buffer(vk_device, destBuffer, buffer_format,
+                               VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+                               src_image->type, &cmd_buffer->pool->alloc,
+                               &pRegions[r]);
+
+      /* For 3D images we iterate over depth slices; otherwise over layers. */
+      unsigned num_slices;
+      if (src_image->type == VK_IMAGE_TYPE_3D) {
+         assert(pRegions[r].imageSubresource.layerCount == 1);
+         num_slices = pRegions[r].imageExtent.depth;
+      } else {
+         assert(pRegions[r].imageExtent.depth == 1);
+         num_slices = pRegions[r].imageSubresource.layerCount;
+      }
+
+      for (unsigned slice = 0; slice < num_slices; slice++) {
+         VkOffset3D src_offset = pRegions[r].imageOffset;
+         src_offset.z += slice;
+
+         struct anv_image_view dest_iview;
+         anv_image_view_init(&dest_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo) {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = anv_image_to_handle(dest_image),
+               .viewType = VK_IMAGE_VIEW_TYPE_2D,
+               .format = buffer_format,
+               .subresourceRange = {
+                  .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                  .baseMipLevel = 0,
+                  .levelCount = 1,
+                  .baseArrayLayer = 0,
+                  .layerCount = 1
+               },
+            },
+            cmd_buffer);
+
+         meta_emit_blit(cmd_buffer,
+                        anv_image_from_handle(srcImage),
+                        &src_iview,
+                        src_offset,
+                        pRegions[r].imageExtent,
+                        dest_image,
+                        &dest_iview,
+                        (VkOffset3D) { 0, 0, 0 },
+                        pRegions[r].imageExtent,
+                        VK_FILTER_NEAREST);
+
+         /* Once we've done the blit, all of the actual information about
+          * the image is embedded in the command buffer so we can just
+          * increment the offset directly in the image effectively
+          * re-binding it to different backing memory.
+          */
+         /* NOTE(review): this advances the buffer-backed dest image using
+          * the *source* image's block size; presumably safe because both
+          * formats are chosen to have equal bs -- confirm.
+          */
+         dest_image->offset += dest_image->extent.width *
+                               dest_image->extent.height *
+                               src_image->format->isl_layout->bs;
+      }
+
+      anv_DestroyImage(vk_device, anv_image_to_handle(dest_image),
+                       &cmd_buffer->pool->alloc);
+   }
+
+   meta_finish_blit(cmd_buffer, &saved_state);
+}
+
+/* vkCmdResolveImage: multisample resolve -- not yet implemented. */
+void anv_CmdResolveImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     srcImage,
+    VkImageLayout                               srcImageLayout,
+    VkImage                                     destImage,
+    VkImageLayout                               destImageLayout,
+    uint32_t                                    regionCount,
+    const VkImageResolve*                       pRegions)
+{
+   stub();
+}
+
+/* VkAllocationCallbacks pfnAllocation for meta state: forwards to the
+ * device's allocator.  The caller-supplied allocationScope is ignored;
+ * all meta allocations are reported as DEVICE scope.
+ */
+static void *
+meta_alloc(void* _device, size_t size, size_t alignment,
+           VkSystemAllocationScope allocationScope)
+{
+   struct anv_device *device = _device;
+   return device->alloc.pfnAllocation(device->alloc.pUserData, size, alignment,
+                                      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+}
+
+/* VkAllocationCallbacks pfnReallocation for meta state: forwards to the
+ * device's allocator, always with DEVICE scope (allocationScope ignored).
+ */
+static void *
+meta_realloc(void* _device, void *original, size_t size, size_t alignment,
+             VkSystemAllocationScope allocationScope)
+{
+   struct anv_device *device = _device;
+   return device->alloc.pfnReallocation(device->alloc.pUserData, original,
+                                        size, alignment,
+                                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+}
+
+/* VkAllocationCallbacks pfnFree for meta state: forwards to the device's
+ * allocator.  NOTE(review): `return` of a void expression is accepted by
+ * GCC/Clang but is not strictly conforming C; a plain call would be.
+ */
+static void
+meta_free(void* _device, void *data)
+{
+   struct anv_device *device = _device;
+   return device->alloc.pfnFree(device->alloc.pUserData, data);
+}
+
+/* Initialize the device's meta state: install the meta allocation
+ * callbacks, then set up the clear and blit pipelines.  On failure the
+ * already-initialized clear state is torn down before returning.
+ */
+VkResult
+anv_device_init_meta(struct anv_device *device)
+{
+   device->meta_state.alloc = (VkAllocationCallbacks) {
+      .pUserData = device,
+      .pfnAllocation = meta_alloc,
+      .pfnReallocation = meta_realloc,
+      .pfnFree = meta_free,
+   };
+
+   VkResult result;
+   result = anv_device_init_meta_clear_state(device);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = anv_device_init_meta_blit_state(device);
+   if (result != VK_SUCCESS) {
+      /* Unwind the clear state so init failure leaves nothing allocated. */
+      anv_device_finish_meta_clear_state(device);
+      return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Destroy everything anv_device_init_meta() created: the clear state and
+ * all blit objects (render pass, the three src-dimension pipelines, the
+ * pipeline layout and the descriptor set layout).
+ */
+void
+anv_device_finish_meta(struct anv_device *device)
+{
+   anv_device_finish_meta_clear_state(device);
+
+   /* Blit */
+   anv_DestroyRenderPass(anv_device_to_handle(device),
+                         device->meta_state.blit.render_pass,
+                         &device->meta_state.alloc);
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       device->meta_state.blit.pipeline_1d_src,
+                       &device->meta_state.alloc);
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       device->meta_state.blit.pipeline_2d_src,
+                       &device->meta_state.alloc);
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       device->meta_state.blit.pipeline_3d_src,
+                       &device->meta_state.alloc);
+   anv_DestroyPipelineLayout(anv_device_to_handle(device),
+                             device->meta_state.blit.pipeline_layout,
+                             &device->meta_state.alloc);
+   anv_DestroyDescriptorSetLayout(anv_device_to_handle(device),
+                                  device->meta_state.blit.ds_layout,
+                                  &device->meta_state.alloc);
+}
diff --git a/src/vulkan/anv_meta.h b/src/vulkan/anv_meta.h
new file mode 100644 (file)
index 0000000..c8d025b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "anv_private.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ANV_META_VERTEX_BINDING_COUNT 2
+
+/* CPU-side copy of the command-buffer state that a meta operation
+ * overwrites, so it can be restored once the meta op finishes.
+ */
+struct anv_meta_saved_state {
+   struct anv_vertex_binding old_vertex_bindings[ANV_META_VERTEX_BINDING_COUNT];
+   struct anv_descriptor_set *old_descriptor_set0;
+   struct anv_pipeline *old_pipeline;
+
+   /**
+    * Bitmask of (1 << VK_DYNAMIC_STATE_*). Defines the set of saved dynamic
+    * state.
+    */
+   uint32_t dynamic_mask;
+   struct anv_dynamic_state dynamic;
+};
+
+/* Capture the bound pipeline, descriptor set 0, vertex bindings, and the
+ * dynamic state selected by dynamic_mask into *state. */
+void
+anv_meta_save(struct anv_meta_saved_state *state,
+              const struct anv_cmd_buffer *cmd_buffer,
+              uint32_t dynamic_mask);
+
+/* Restore the state previously captured by anv_meta_save(). */
+void
+anv_meta_restore(const struct anv_meta_saved_state *state,
+                 struct anv_cmd_buffer *cmd_buffer);
+
+/* Image-view type meta operations should use for the given image. */
+VkImageViewType
+anv_meta_get_view_type(const struct anv_image *image);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/vulkan/anv_meta_clear.c b/src/vulkan/anv_meta_clear.c
new file mode 100644 (file)
index 0000000..6ba27b9
--- /dev/null
@@ -0,0 +1,1131 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_meta.h"
+#include "anv_meta_clear.h"
+#include "anv_private.h"
+#include "glsl/nir/nir_builder.h"
+
+/** Vertex attributes for color clears.  */
+struct color_clear_vattrs {
+   struct anv_vue_header vue_header;
+   float position[2]; /**< 3DPRIM_RECTLIST */
+   VkClearColorValue color;
+};
+
+/** Vertex attributes for depthstencil clears.  */
+struct depthstencil_clear_vattrs {
+   struct anv_vue_header vue_header;
+   float position[2]; /**< 3DPRIM_RECTLIST */
+};
+
+/* Save only the dynamic state that the clear pipelines overwrite:
+ * viewport, scissor and stencil reference.
+ */
+static void
+meta_clear_begin(struct anv_meta_saved_state *saved_state,
+                 struct anv_cmd_buffer *cmd_buffer)
+{
+   anv_meta_save(saved_state, cmd_buffer,
+                 (1 << VK_DYNAMIC_STATE_VIEWPORT) |
+                 (1 << VK_DYNAMIC_STATE_SCISSOR) |
+                 (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE));
+
+   /* Reset the counts so the viewport/scissor emitted during the clear are
+    * treated as fresh state rather than merged with the app's. */
+   cmd_buffer->state.dynamic.viewport.count = 0;
+   cmd_buffer->state.dynamic.scissor.count = 0;
+}
+
+/* Restore the state saved by meta_clear_begin(). */
+static void
+meta_clear_end(struct anv_meta_saved_state *saved_state,
+               struct anv_cmd_buffer *cmd_buffer)
+{
+   anv_meta_restore(saved_state, cmd_buffer);
+}
+
+/* Build pass-through NIR shaders for color clears.  The VS copies position
+ * and a flat-interpolated clear color to its outputs; the FS writes that
+ * color to FRAG_RESULT_DATA0 + frag_output.  The returned shaders are
+ * later freed by create_pipeline().
+ */
+static void
+build_color_shaders(struct nir_shader **out_vs,
+                    struct nir_shader **out_fs,
+                    uint32_t frag_output)
+{
+   nir_builder vs_b;
+   nir_builder fs_b;
+
+   nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL);
+   nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
+
+   vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_color_vs");
+   fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_color_fs");
+
+   const struct glsl_type *position_type = glsl_vec4_type();
+   const struct glsl_type *color_type = glsl_vec4_type();
+
+   nir_variable *vs_in_pos =
+      nir_variable_create(vs_b.shader, nir_var_shader_in, position_type,
+                          "a_position");
+   vs_in_pos->data.location = VERT_ATTRIB_GENERIC0;
+
+   nir_variable *vs_out_pos =
+      nir_variable_create(vs_b.shader, nir_var_shader_out, position_type,
+                          "gl_Position");
+   vs_out_pos->data.location = VARYING_SLOT_POS;
+
+   nir_variable *vs_in_color =
+      nir_variable_create(vs_b.shader, nir_var_shader_in, color_type,
+                          "a_color");
+   vs_in_color->data.location = VERT_ATTRIB_GENERIC1;
+
+   /* The clear color is constant across the rect, so flat interpolation
+    * avoids any per-fragment interpolation work. */
+   nir_variable *vs_out_color =
+      nir_variable_create(vs_b.shader, nir_var_shader_out, color_type,
+                          "v_color");
+   vs_out_color->data.location = VARYING_SLOT_VAR0;
+   vs_out_color->data.interpolation = INTERP_QUALIFIER_FLAT;
+
+   nir_variable *fs_in_color =
+      nir_variable_create(fs_b.shader, nir_var_shader_in, color_type,
+                          "v_color");
+   fs_in_color->data.location = vs_out_color->data.location;
+   fs_in_color->data.interpolation = vs_out_color->data.interpolation;
+
+   nir_variable *fs_out_color =
+      nir_variable_create(fs_b.shader, nir_var_shader_out, color_type,
+                          "f_color");
+   fs_out_color->data.location = FRAG_RESULT_DATA0 + frag_output;
+
+   nir_copy_var(&vs_b, vs_out_pos, vs_in_pos);
+   nir_copy_var(&vs_b, vs_out_color, vs_in_color);
+   nir_copy_var(&fs_b, fs_out_color, fs_in_color);
+
+   *out_vs = vs_b.shader;
+   *out_fs = fs_b.shader;
+}
+
+/* Create a graphics pipeline for a meta clear: two-stage (VS+FS) RECTLIST
+ * pipeline with every piece of pipeline state declared dynamic.  Takes
+ * ownership of vs_nir/fs_nir (they are ralloc_free'd here regardless of
+ * the result).
+ */
+static VkResult
+create_pipeline(struct anv_device *device,
+                struct nir_shader *vs_nir,
+                struct nir_shader *fs_nir,
+                const VkPipelineVertexInputStateCreateInfo *vi_state,
+                const VkPipelineDepthStencilStateCreateInfo *ds_state,
+                const VkPipelineColorBlendStateCreateInfo *cb_state,
+                const VkAllocationCallbacks *alloc,
+                bool use_repclear,
+                struct anv_pipeline **pipeline)
+{
+   VkDevice device_h = anv_device_to_handle(device);
+   VkResult result;
+
+   struct anv_shader_module vs_m = { .nir = vs_nir };
+   struct anv_shader_module fs_m = { .nir = fs_nir };
+
+   VkPipeline pipeline_h = VK_NULL_HANDLE;
+   result = anv_graphics_pipeline_create(device_h,
+      VK_NULL_HANDLE,
+      &(VkGraphicsPipelineCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+         .stageCount = 2,
+         .pStages = (VkPipelineShaderStageCreateInfo[]) {
+            {
+               .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+               .stage = VK_SHADER_STAGE_VERTEX_BIT,
+               .module = anv_shader_module_to_handle(&vs_m),
+               .pName = "main",
+            },
+            {
+               .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+               .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+               .module = anv_shader_module_to_handle(&fs_m),
+               .pName = "main",
+            },
+         },
+         .pVertexInputState = vi_state,
+         .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+            .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+            .primitiveRestartEnable = false,
+         },
+         .pViewportState = &(VkPipelineViewportStateCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+            .viewportCount = 1,
+            .pViewports = NULL, /* dynamic */
+            .scissorCount = 1,
+            .pScissors = NULL, /* dynamic */
+         },
+         .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+            .rasterizerDiscardEnable = false,
+            .polygonMode = VK_POLYGON_MODE_FILL,
+            .cullMode = VK_CULL_MODE_NONE,
+            .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
+            .depthBiasEnable = false,
+         },
+         .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+            .rasterizationSamples = 1, /* FINISHME: Multisampling */
+            .sampleShadingEnable = false,
+            .pSampleMask = (VkSampleMask[]) { UINT32_MAX },
+            .alphaToCoverageEnable = false,
+            .alphaToOneEnable = false,
+         },
+         .pDepthStencilState = ds_state,
+         .pColorBlendState = cb_state,
+         .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+            /* The meta clear pipeline declares all state as dynamic.
+             * As a consequence, vkCmdBindPipeline writes no dynamic state
+             * to the cmd buffer. Therefore, at the end of the meta clear,
+             * we need only restore the dynamic state that was set with
+             * vkCmdSet*.
+             */
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+            .dynamicStateCount = 9,
+            .pDynamicStates = (VkDynamicState[]) {
+               VK_DYNAMIC_STATE_VIEWPORT,
+               VK_DYNAMIC_STATE_SCISSOR,
+               VK_DYNAMIC_STATE_LINE_WIDTH,
+               VK_DYNAMIC_STATE_DEPTH_BIAS,
+               VK_DYNAMIC_STATE_BLEND_CONSTANTS,
+               VK_DYNAMIC_STATE_DEPTH_BOUNDS,
+               VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
+               VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
+               VK_DYNAMIC_STATE_STENCIL_REFERENCE,
+            },
+         },
+         .flags = 0,
+         .renderPass = anv_render_pass_to_handle(&anv_meta_dummy_renderpass),
+         .subpass = 0,
+      },
+      &(struct anv_graphics_pipeline_create_info) {
+         .color_attachment_count = MAX_RTS,
+         .use_repclear = use_repclear,
+         .disable_viewport = true,
+         .disable_vs = true,
+         .use_rectlist = true
+      },
+      alloc,
+      &pipeline_h);
+
+   /* The NIR is consumed by pipeline creation; free it unconditionally. */
+   ralloc_free(vs_nir);
+   ralloc_free(fs_nir);
+
+   /* On failure pipeline_h is still VK_NULL_HANDLE, so *pipeline is NULL. */
+   *pipeline = anv_pipeline_from_handle(pipeline_h);
+
+   return result;
+}
+
+/* Create the clear pipeline for color attachment index `frag_output`:
+ * three vertex attributes (VUE header, 2D position, clear color) and a
+ * single blend attachment writing all RGBA channels.
+ */
+static VkResult
+create_color_pipeline(struct anv_device *device, uint32_t frag_output,
+                      struct anv_pipeline **pipeline)
+{
+   struct nir_shader *vs_nir;
+   struct nir_shader *fs_nir;
+   build_color_shaders(&vs_nir, &fs_nir, frag_output);
+
+   const VkPipelineVertexInputStateCreateInfo vi_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+      .vertexBindingDescriptionCount = 1,
+      .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
+         {
+            .binding = 0,
+            .stride = sizeof(struct color_clear_vattrs),
+            .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+         },
+      },
+      .vertexAttributeDescriptionCount = 3,
+      .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
+         {
+            /* VUE Header */
+            .location = 0,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32B32A32_UINT,
+            .offset = offsetof(struct color_clear_vattrs, vue_header),
+         },
+         {
+            /* Position */
+            .location = 1,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32_SFLOAT,
+            .offset = offsetof(struct color_clear_vattrs, position),
+         },
+         {
+            /* Color */
+            .location = 2,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32B32A32_SFLOAT,
+            .offset = offsetof(struct color_clear_vattrs, color),
+         },
+      },
+   };
+
+   /* Depth/stencil is fully disabled for color clears. */
+   const VkPipelineDepthStencilStateCreateInfo ds_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+      .depthTestEnable = false,
+      .depthWriteEnable = false,
+      .depthBoundsTestEnable = false,
+      .stencilTestEnable = false,
+   };
+
+   const VkPipelineColorBlendStateCreateInfo cb_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+      .logicOpEnable = false,
+      .attachmentCount = 1,
+      .pAttachments = (VkPipelineColorBlendAttachmentState []) {
+         {
+            .blendEnable = false,
+            .colorWriteMask = VK_COLOR_COMPONENT_A_BIT |
+                              VK_COLOR_COMPONENT_R_BIT |
+                              VK_COLOR_COMPONENT_G_BIT |
+                              VK_COLOR_COMPONENT_B_BIT,
+         },
+      },
+   };
+
+   /* Disable repclear because we do not want the compiler to replace the
+    * shader. We need the shader to write to the specified color attachment,
+    * but the repclear shader writes to all color attachments.
+    */
+   return
+      create_pipeline(device, vs_nir, fs_nir, &vi_state, &ds_state,
+                      &cb_state, &device->meta_state.alloc,
+                      /*use_repclear*/ false, pipeline);
+}
+
+/* Destroy every created color-clear pipeline.  NULL entries (zeroed by
+ * init_color_pipelines() and never created) are skipped, so this is safe
+ * to call from a partially-failed init.
+ */
+static void
+free_color_pipelines(struct anv_device *device)
+{
+   for (uint32_t i = 0;
+        i < ARRAY_SIZE(device->meta_state.clear.color_pipelines); ++i) {
+      if (device->meta_state.clear.color_pipelines[i] == NULL)
+         continue;
+
+      ANV_CALL(DestroyPipeline)(
+         anv_device_to_handle(device),
+         anv_pipeline_to_handle(device->meta_state.clear.color_pipelines[i]),
+         &device->meta_state.alloc);
+   }
+}
+
+/* Create one color-clear pipeline per color attachment index (the fragment
+ * output location is baked into each pipeline's shader).  On failure, any
+ * pipelines created so far are destroyed.
+ */
+static VkResult
+init_color_pipelines(struct anv_device *device)
+{
+   VkResult result;
+   struct anv_pipeline **pipelines = device->meta_state.clear.color_pipelines;
+   uint32_t n = ARRAY_SIZE(device->meta_state.clear.color_pipelines);
+
+   /* Zero first so free_color_pipelines() can skip never-created slots if
+    * we fail partway through. */
+   zero(device->meta_state.clear.color_pipelines);
+
+   for (uint32_t i = 0; i < n; ++i) {
+      result = create_color_pipeline(device, i, &pipelines[i]);
+      /* VkResult success codes are >= 0, so "result < 0" is the wrong
+       * success test; compare against VK_SUCCESS like every other call
+       * site in this file. */
+      if (result != VK_SUCCESS)
+         goto fail;
+   }
+
+   return VK_SUCCESS;
+
+fail:
+   free_color_pipelines(device);
+
+   return result;
+}
+
+/* Clear one color attachment of the current subpass by drawing a
+ * 3DPRIM_RECTLIST (three vertices defining an axis-aligned rect) covering
+ * clear_rect, with the clear color carried as a flat vertex attribute.
+ */
+static void
+emit_color_clear(struct anv_cmd_buffer *cmd_buffer,
+                 const VkClearAttachment *clear_att,
+                 const VkClearRect *clear_rect)
+{
+   struct anv_device *device = cmd_buffer->device;
+   const struct anv_subpass *subpass = cmd_buffer->state.subpass;
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   VkClearColorValue clear_value = clear_att->clearValue.color;
+   struct anv_pipeline *pipeline =
+      device->meta_state.clear.color_pipelines[clear_att->colorAttachment];
+
+   VkCommandBuffer cmd_buffer_h = anv_cmd_buffer_to_handle(cmd_buffer);
+   VkPipeline pipeline_h = anv_pipeline_to_handle(pipeline);
+
+   assert(clear_att->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+   assert(clear_att->colorAttachment < subpass->color_count);
+
+   /* Three corners of the rect; the hardware derives the fourth. */
+   const struct color_clear_vattrs vertex_data[3] = {
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x,
+            clear_rect->rect.offset.y,
+         },
+         .color = clear_value,
+      },
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x + clear_rect->rect.extent.width,
+            clear_rect->rect.offset.y,
+         },
+         .color = clear_value,
+      },
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x + clear_rect->rect.extent.width,
+            clear_rect->rect.offset.y + clear_rect->rect.extent.height,
+         },
+         .color = clear_value,
+      },
+   };
+
+   /* Upload the vertex data into the command buffer's dynamic state
+    * stream and wrap it in a transient anv_buffer. */
+   struct anv_state state =
+      anv_cmd_buffer_emit_dynamic(cmd_buffer, vertex_data, sizeof(vertex_data), 16);
+
+   struct anv_buffer vertex_buffer = {
+      .device = device,
+      .size = sizeof(vertex_data),
+      .bo = &device->dynamic_state_block_pool.bo,
+      .offset = state.offset,
+   };
+
+   ANV_CALL(CmdSetViewport)(cmd_buffer_h, 0, 1,
+      (VkViewport[]) {
+         {
+            .x = 0,
+            .y = 0,
+            .width = fb->width,
+            .height = fb->height,
+            .minDepth = 0.0,
+            .maxDepth = 1.0,
+         },
+      });
+
+   ANV_CALL(CmdSetScissor)(cmd_buffer_h, 0, 1,
+      (VkRect2D[]) {
+         {
+            .offset = { 0, 0 },
+            .extent = { fb->width, fb->height },
+         }
+      });
+
+   ANV_CALL(CmdBindVertexBuffers)(cmd_buffer_h, 0, 1,
+      (VkBuffer[]) { anv_buffer_to_handle(&vertex_buffer) },
+      (VkDeviceSize[]) { 0 });
+
+   /* Skip the redundant bind if this pipeline is already current. */
+   if (cmd_buffer->state.pipeline != pipeline) {
+      ANV_CALL(CmdBindPipeline)(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                pipeline_h);
+   }
+
+   ANV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0);
+}
+
+
+/* Build NIR shaders for depth/stencil clears: the VS passes position
+ * through, and the FS is empty (it declares no outputs — depth/stencil
+ * values come from pipeline state, not shader writes).
+ */
+static void
+build_depthstencil_shaders(struct nir_shader **out_vs,
+                           struct nir_shader **out_fs)
+{
+   nir_builder vs_b;
+   nir_builder fs_b;
+
+   nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL);
+   nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL);
+
+   vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_depthstencil_vs");
+   fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_depthstencil_fs");
+
+   const struct glsl_type *position_type = glsl_vec4_type();
+
+   nir_variable *vs_in_pos =
+      nir_variable_create(vs_b.shader, nir_var_shader_in, position_type,
+                          "a_position");
+   vs_in_pos->data.location = VERT_ATTRIB_GENERIC0;
+
+   nir_variable *vs_out_pos =
+      nir_variable_create(vs_b.shader, nir_var_shader_out, position_type,
+                          "gl_Position");
+   vs_out_pos->data.location = VARYING_SLOT_POS;
+
+   nir_copy_var(&vs_b, vs_out_pos, vs_in_pos);
+
+   *out_vs = vs_b.shader;
+   *out_fs = fs_b.shader;
+}
+
+/* Create a depth/stencil-clear pipeline for the given aspect combination.
+ * Depth test/write and stencil test are enabled per the aspects bits, with
+ * VK_COMPARE_OP_ALWAYS so every covered fragment writes.
+ */
+static VkResult
+create_depthstencil_pipeline(struct anv_device *device,
+                             VkImageAspectFlags aspects,
+                             struct anv_pipeline **pipeline)
+{
+   struct nir_shader *vs_nir;
+   struct nir_shader *fs_nir;
+
+   build_depthstencil_shaders(&vs_nir, &fs_nir);
+
+   const VkPipelineVertexInputStateCreateInfo vi_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+      .vertexBindingDescriptionCount = 1,
+      .pVertexBindingDescriptions = (VkVertexInputBindingDescription[]) {
+         {
+            .binding = 0,
+            .stride = sizeof(struct depthstencil_clear_vattrs),
+            .inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+         },
+      },
+      .vertexAttributeDescriptionCount = 2,
+      .pVertexAttributeDescriptions = (VkVertexInputAttributeDescription[]) {
+         {
+            /* VUE Header */
+            .location = 0,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32B32A32_UINT,
+            .offset = offsetof(struct depthstencil_clear_vattrs, vue_header),
+         },
+         {
+            /* Position */
+            .location = 1,
+            .binding = 0,
+            .format = VK_FORMAT_R32G32_SFLOAT,
+            .offset = offsetof(struct depthstencil_clear_vattrs, position),
+         },
+      },
+   };
+
+   const VkPipelineDepthStencilStateCreateInfo ds_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+      .depthTestEnable = (aspects & VK_IMAGE_ASPECT_DEPTH_BIT),
+      .depthCompareOp = VK_COMPARE_OP_ALWAYS,
+      .depthWriteEnable = (aspects & VK_IMAGE_ASPECT_DEPTH_BIT),
+      .depthBoundsTestEnable = false,
+      .stencilTestEnable = (aspects & VK_IMAGE_ASPECT_STENCIL_BIT),
+      .front = {
+         .passOp = VK_STENCIL_OP_REPLACE,
+         .compareOp = VK_COMPARE_OP_ALWAYS,
+         .writeMask = UINT32_MAX,
+         .reference = 0, /* dynamic */
+      },
+      .back = { 0 /* dont care */ },
+   };
+
+   /* No color attachments: depth/stencil clears write no color. */
+   const VkPipelineColorBlendStateCreateInfo cb_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+      .logicOpEnable = false,
+      .attachmentCount = 0,
+      .pAttachments = NULL,
+   };
+
+   /* Repclear is fine here since there is no color output to preserve. */
+   return create_pipeline(device, vs_nir, fs_nir, &vi_state, &ds_state,
+                          &cb_state, &device->meta_state.alloc,
+                          /*use_repclear*/ true, pipeline);
+}
+
+/* Clear the depth and/or stencil aspects of the current subpass's
+ * depth-stencil attachment with a RECTLIST draw.  The depth clear value
+ * is applied by collapsing the viewport depth range to clear_value.depth;
+ * the stencil clear value is applied via the dynamic stencil reference
+ * with VK_STENCIL_OP_REPLACE.
+ */
+static void
+emit_depthstencil_clear(struct anv_cmd_buffer *cmd_buffer,
+                        const VkClearAttachment *clear_att,
+                        const VkClearRect *clear_rect)
+{
+   struct anv_device *device = cmd_buffer->device;
+   const struct anv_subpass *subpass = cmd_buffer->state.subpass;
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   uint32_t attachment = subpass->depth_stencil_attachment;
+   VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
+   VkImageAspectFlags aspects = clear_att->aspectMask;
+
+   VkCommandBuffer cmd_buffer_h = anv_cmd_buffer_to_handle(cmd_buffer);
+
+   assert(aspects == VK_IMAGE_ASPECT_DEPTH_BIT ||
+          aspects == VK_IMAGE_ASPECT_STENCIL_BIT ||
+          aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
+                      VK_IMAGE_ASPECT_STENCIL_BIT));
+   assert(attachment != VK_ATTACHMENT_UNUSED);
+
+   /* Three corners of the rect; the hardware derives the fourth. */
+   const struct depthstencil_clear_vattrs vertex_data[3] = {
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x,
+            clear_rect->rect.offset.y,
+         },
+      },
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x + clear_rect->rect.extent.width,
+            clear_rect->rect.offset.y,
+         },
+      },
+      {
+         .vue_header = { 0 },
+         .position = {
+            clear_rect->rect.offset.x + clear_rect->rect.extent.width,
+            clear_rect->rect.offset.y + clear_rect->rect.extent.height,
+         },
+      },
+   };
+
+   struct anv_state state =
+      anv_cmd_buffer_emit_dynamic(cmd_buffer, vertex_data, sizeof(vertex_data), 16);
+
+   struct anv_buffer vertex_buffer = {
+      .device = device,
+      .size = sizeof(vertex_data),
+      .bo = &device->dynamic_state_block_pool.bo,
+      .offset = state.offset,
+   };
+
+   ANV_CALL(CmdSetViewport)(cmd_buffer_h, 0, 1,
+      (VkViewport[]) {
+         {
+            .x = 0,
+            .y = 0,
+            .width = fb->width,
+            .height = fb->height,
+
+            /* Ignored when clearing only stencil. */
+            .minDepth = clear_value.depth,
+            .maxDepth = clear_value.depth,
+         },
+      });
+
+   ANV_CALL(CmdSetScissor)(cmd_buffer_h, 0, 1,
+      (VkRect2D[]) {
+         {
+            .offset = { 0, 0 },
+            .extent = { fb->width, fb->height },
+         }
+      });
+
+   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+      ANV_CALL(CmdSetStencilReference)(cmd_buffer_h, VK_STENCIL_FACE_FRONT_BIT,
+                                       clear_value.stencil);
+   }
+
+   ANV_CALL(CmdBindVertexBuffers)(cmd_buffer_h, 0, 1,
+      (VkBuffer[]) { anv_buffer_to_handle(&vertex_buffer) },
+      (VkDeviceSize[]) { 0 });
+
+   /* Pick the pipeline matching exactly the aspects being cleared. */
+   struct anv_pipeline *pipeline;
+   switch (aspects) {
+   case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT:
+      pipeline = device->meta_state.clear.depthstencil_pipeline;
+      break;
+   case VK_IMAGE_ASPECT_DEPTH_BIT:
+      pipeline = device->meta_state.clear.depth_only_pipeline;
+      break;
+   case VK_IMAGE_ASPECT_STENCIL_BIT:
+      pipeline = device->meta_state.clear.stencil_only_pipeline;
+      break;
+   default:
+      unreachable("expected depth or stencil aspect");
+   }
+
+   if (cmd_buffer->state.pipeline != pipeline) {
+      ANV_CALL(CmdBindPipeline)(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                anv_pipeline_to_handle(pipeline));
+   }
+
+   ANV_CALL(CmdDraw)(cmd_buffer_h, 3, 1, 0, 0);
+}
+
+/* Create the three depth/stencil clear pipelines (depth-only, stencil-only
+ * and combined), unwinding in reverse creation order on failure.
+ */
+static VkResult
+init_depthstencil_pipelines(struct anv_device *device)
+{
+   VkResult result;
+   struct anv_meta_state *state = &device->meta_state;
+
+   result =
+      create_depthstencil_pipeline(device, VK_IMAGE_ASPECT_DEPTH_BIT,
+                                   &state->clear.depth_only_pipeline);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   result =
+      create_depthstencil_pipeline(device, VK_IMAGE_ASPECT_STENCIL_BIT,
+                                   &state->clear.stencil_only_pipeline);
+   if (result != VK_SUCCESS)
+      goto fail_depth_only;
+
+   result =
+      create_depthstencil_pipeline(device,
+                                   VK_IMAGE_ASPECT_DEPTH_BIT |
+                                   VK_IMAGE_ASPECT_STENCIL_BIT,
+                                   &state->clear.depthstencil_pipeline);
+   if (result != VK_SUCCESS)
+      goto fail_stencil_only;
+
+   return result;
+
+ fail_stencil_only:
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       anv_pipeline_to_handle(state->clear.stencil_only_pipeline),
+                       NULL);
+ fail_depth_only:
+   anv_DestroyPipeline(anv_device_to_handle(device),
+                       anv_pipeline_to_handle(state->clear.depth_only_pipeline),
+                       NULL);
+ fail:
+   return result;
+}
+
+/* Create all clear pipelines (color + depth/stencil) for the device,
+ * cleaning up the color pipelines if the depth/stencil ones fail.
+ */
+VkResult
+anv_device_init_meta_clear_state(struct anv_device *device)
+{
+   VkResult result;
+
+   result = init_color_pipelines(device);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = init_depthstencil_pipelines(device);
+   if (result != VK_SUCCESS) {
+      free_color_pipelines(device);
+      return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Destroy everything created by anv_device_init_meta_clear_state(). */
+void
+anv_device_finish_meta_clear_state(struct anv_device *device)
+{
+   VkDevice device_h = anv_device_to_handle(device);
+
+   free_color_pipelines(device);
+
+   ANV_CALL(DestroyPipeline)(device_h,
+      anv_pipeline_to_handle(device->meta_state.clear.depth_only_pipeline),
+      NULL);
+   ANV_CALL(DestroyPipeline)(device_h,
+      anv_pipeline_to_handle(device->meta_state.clear.stencil_only_pipeline),
+      NULL);
+   ANV_CALL(DestroyPipeline)(device_h,
+      anv_pipeline_to_handle(device->meta_state.clear.depthstencil_pipeline),
+      NULL);
+}
+
+/**
+ * Dispatch a single attachment clear to the color or depth/stencil path.
+ * The parameters mean the same as those in vkCmdClearAttachments.
+ */
+static void
+emit_clear(struct anv_cmd_buffer *cmd_buffer,
+           const VkClearAttachment *clear_att,
+           const VkClearRect *clear_rect)
+{
+   if (clear_att->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+      emit_color_clear(cmd_buffer, clear_att, clear_rect);
+   } else {
+      assert(clear_att->aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT |
+                                      VK_IMAGE_ASPECT_STENCIL_BIT));
+      emit_depthstencil_clear(cmd_buffer, clear_att, clear_rect);
+   }
+}
+
+/* Return true if any attachment used by the current subpass (color or
+ * depth/stencil) still has pending clear aspects.
+ */
+static bool
+subpass_needs_clear(const struct anv_cmd_buffer *cmd_buffer)
+{
+   const struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   uint32_t ds = cmd_state->subpass->depth_stencil_attachment;
+
+   for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
+      uint32_t a = cmd_state->subpass->color_attachments[i];
+      if (cmd_state->attachments[a].pending_clear_aspects) {
+         return true;
+      }
+   }
+
+   if (ds != VK_ATTACHMENT_UNUSED &&
+       cmd_state->attachments[ds].pending_clear_aspects) {
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Emit any pending attachment clears for the current subpass.
+ *
+ * @see anv_attachment_state::pending_clear_aspects
+ */
+void
+anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   struct anv_meta_saved_state saved_state;
+
+   /* Avoid the save/restore round trip when nothing is pending. */
+   if (!subpass_needs_clear(cmd_buffer))
+      return;
+
+   meta_clear_begin(&saved_state, cmd_buffer);
+
+   if (cmd_state->framebuffer->layers > 1)
+      anv_finishme("clearing multi-layer framebuffer");
+
+   /* Subpass clears always cover the whole framebuffer. */
+   VkClearRect clear_rect = {
+      .rect = {
+         .offset = { 0, 0 },
+         .extent = { fb->width, fb->height },
+      },
+      .baseArrayLayer = 0,
+      .layerCount = 1, /* FINISHME: clear multi-layer framebuffer */
+   };
+
+   for (uint32_t i = 0; i < cmd_state->subpass->color_count; ++i) {
+      uint32_t a = cmd_state->subpass->color_attachments[i];
+
+      if (!cmd_state->attachments[a].pending_clear_aspects)
+         continue;
+
+      assert(cmd_state->attachments[a].pending_clear_aspects ==
+             VK_IMAGE_ASPECT_COLOR_BIT);
+
+      VkClearAttachment clear_att = {
+         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+         .colorAttachment = i, /* Use attachment index relative to subpass */
+         .clearValue = cmd_state->attachments[a].clear_value,
+      };
+
+      emit_clear(cmd_buffer, &clear_att, &clear_rect);
+      /* Mark the clear as done so it is not re-emitted. */
+      cmd_state->attachments[a].pending_clear_aspects = 0;
+   }
+
+   uint32_t ds = cmd_state->subpass->depth_stencil_attachment;
+
+   if (ds != VK_ATTACHMENT_UNUSED &&
+       cmd_state->attachments[ds].pending_clear_aspects) {
+
+      VkClearAttachment clear_att = {
+         .aspectMask = cmd_state->attachments[ds].pending_clear_aspects,
+         .clearValue = cmd_state->attachments[ds].clear_value,
+      };
+
+      emit_clear(cmd_buffer, &clear_att, &clear_rect);
+      cmd_state->attachments[ds].pending_clear_aspects = 0;
+   }
+
+   meta_clear_end(&saved_state, cmd_buffer);
+}
+
+/* Clear every (mip level, array layer) subresource named by 'ranges'.
+ *
+ * For each subresource a throw-away image view, framebuffer and
+ * single-attachment render pass are created, emit_clear() performs the
+ * actual clear inside the pass, and the pass and framebuffer are
+ * destroyed again before moving on.  Color vs. depth/stencil is chosen
+ * from the range's aspectMask.
+ */
+static void
+anv_cmd_clear_image(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_image *image,
+                    VkImageLayout image_layout,
+                    const VkClearValue *clear_value,
+                    uint32_t range_count,
+                    const VkImageSubresourceRange *ranges)
+{
+   VkDevice device_h = anv_device_to_handle(cmd_buffer->device);
+
+   for (uint32_t r = 0; r < range_count; r++) {
+      const VkImageSubresourceRange *range = &ranges[r];
+
+      for (uint32_t l = 0; l < range->levelCount; ++l) {
+         for (uint32_t s = 0; s < range->layerCount; ++s) {
+            /* View of exactly one mip level / array layer of the image */
+            struct anv_image_view iview;
+            anv_image_view_init(&iview, cmd_buffer->device,
+               &(VkImageViewCreateInfo) {
+                  .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+                  .image = anv_image_to_handle(image),
+                  .viewType = anv_meta_get_view_type(image),
+                  .format = image->vk_format,
+                  .subresourceRange = {
+                     .aspectMask = range->aspectMask,
+                     .baseMipLevel = range->baseMipLevel + l,
+                     .levelCount = 1,
+                     .baseArrayLayer = range->baseArrayLayer + s,
+                     .layerCount = 1
+                  },
+               },
+               cmd_buffer);
+
+            VkFramebuffer fb;
+            anv_CreateFramebuffer(device_h,
+               &(VkFramebufferCreateInfo) {
+                  .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+                  .attachmentCount = 1,
+                  .pAttachments = (VkImageView[]) {
+                     anv_image_view_to_handle(&iview),
+                  },
+                  .width = iview.extent.width,
+                  .height = iview.extent.height,
+                  .layers = 1
+               },
+               &cmd_buffer->pool->alloc,
+               &fb);
+
+            /* LOAD/STORE ops so everything outside the cleared rect is
+             * preserved; only emit_clear() touches the attachment.
+             */
+            VkAttachmentDescription att_desc = {
+               .format = iview.vk_format,
+               .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+               .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+               .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+               .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE,
+               .initialLayout = image_layout,
+               .finalLayout = image_layout,
+            };
+
+            VkSubpassDescription subpass_desc = {
+               .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+               .inputAttachmentCount = 0,
+               .colorAttachmentCount = 0,
+               .pColorAttachments = NULL,
+               .pResolveAttachments = NULL,
+               .pDepthStencilAttachment = NULL,
+               .preserveAttachmentCount = 0,
+               .pPreserveAttachments = NULL,
+            };
+
+            const VkAttachmentReference att_ref = {
+               .attachment = 0,
+               .layout = image_layout,
+            };
+
+            /* Attach as color or as depth/stencil depending on aspect */
+            if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+               subpass_desc.colorAttachmentCount = 1;
+               subpass_desc.pColorAttachments = &att_ref;
+            } else {
+               subpass_desc.pDepthStencilAttachment = &att_ref;
+            }
+
+            VkRenderPass pass;
+            anv_CreateRenderPass(device_h,
+               &(VkRenderPassCreateInfo) {
+                  .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+                  .attachmentCount = 1,
+                  .pAttachments = &att_desc,
+                  .subpassCount = 1,
+                  .pSubpasses = &subpass_desc,
+               },
+               &cmd_buffer->pool->alloc,
+               &pass);
+
+            ANV_CALL(CmdBeginRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer),
+               &(VkRenderPassBeginInfo) {
+                  .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+                  .renderArea = {
+                     .offset = { 0, 0, },
+                     .extent = {
+                        .width = iview.extent.width,
+                        .height = iview.extent.height,
+                     },
+                  },
+                  .renderPass = pass,
+                  .framebuffer = fb,
+                  .clearValueCount = 0,
+                  .pClearValues = NULL,
+               },
+               VK_SUBPASS_CONTENTS_INLINE);
+
+            VkClearAttachment clear_att = {
+               .aspectMask = range->aspectMask,
+               .colorAttachment = 0,
+               .clearValue = *clear_value,
+            };
+
+            /* NOTE(review): baseArrayLayer is range->baseArrayLayer even
+             * though the framebuffer contains only the single layer already
+             * selected by the image view -- it looks like 0 was intended
+             * here; confirm.
+             */
+            VkClearRect clear_rect = {
+               .rect = {
+                  .offset = { 0, 0 },
+                  .extent = { iview.extent.width, iview.extent.height },
+               },
+               .baseArrayLayer = range->baseArrayLayer,
+               .layerCount = 1, /* FINISHME: clear multi-layer framebuffer */
+            };
+
+            emit_clear(cmd_buffer, &clear_att, &clear_rect);
+
+            ANV_CALL(CmdEndRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer));
+            ANV_CALL(DestroyRenderPass)(device_h, pass,
+                                        &cmd_buffer->pool->alloc);
+            ANV_CALL(DestroyFramebuffer)(device_h, fb,
+                                         &cmd_buffer->pool->alloc);
+         }
+      }
+   }
+}
+
+/* vkCmdClearColorImage: clear ranges of a color image outside a render
+ * pass.  Saves/restores meta state around the shared clear-image helper.
+ * The VkClearColorValue pointer is reinterpreted as VkClearValue; this is
+ * valid because 'color' is a member of the VkClearValue union and all
+ * union members share the same address.
+ */
+void anv_CmdClearColorImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     image_h,
+    VkImageLayout                               imageLayout,
+    const VkClearColorValue*                    pColor,
+    uint32_t                                    rangeCount,
+    const VkImageSubresourceRange*              pRanges)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, image, image_h);
+   struct anv_meta_saved_state saved_state;
+
+   meta_clear_begin(&saved_state, cmd_buffer);
+
+   anv_cmd_clear_image(cmd_buffer, image, imageLayout,
+                       (const VkClearValue *) pColor,
+                       rangeCount, pRanges);
+
+   meta_clear_end(&saved_state, cmd_buffer);
+}
+
+/* vkCmdClearDepthStencilImage: depth/stencil counterpart of
+ * anv_CmdClearColorImage.  The VkClearDepthStencilValue pointer is
+ * reinterpreted as VkClearValue, which is valid because 'depthStencil'
+ * is a member of the VkClearValue union (all members share offset 0).
+ */
+void anv_CmdClearDepthStencilImage(
+    VkCommandBuffer                             commandBuffer,
+    VkImage                                     image_h,
+    VkImageLayout                               imageLayout,
+    const VkClearDepthStencilValue*             pDepthStencil,
+    uint32_t                                    rangeCount,
+    const VkImageSubresourceRange*              pRanges)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_image, image, image_h);
+   struct anv_meta_saved_state saved_state;
+
+   meta_clear_begin(&saved_state, cmd_buffer);
+
+   anv_cmd_clear_image(cmd_buffer, image, imageLayout,
+                       (const VkClearValue *) pDepthStencil,
+                       rangeCount, pRanges);
+
+   meta_clear_end(&saved_state, cmd_buffer);
+}
+
+/* vkCmdClearAttachments: clear attachments of the current subpass.
+ * Emits one meta clear per (attachment, rect) pair.
+ */
+void anv_CmdClearAttachments(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    attachmentCount,
+    const VkClearAttachment*                    pAttachments,
+    uint32_t                                    rectCount,
+    const VkClearRect*                          pRects)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_meta_saved_state saved_state;
+
+   meta_clear_begin(&saved_state, cmd_buffer);
+
+   /* FINISHME: We can do better than this dumb loop. It thrashes too much
+    * state.
+    */
+   for (uint32_t a = 0; a < attachmentCount; ++a) {
+      for (uint32_t r = 0; r < rectCount; ++r) {
+         emit_clear(cmd_buffer, &pAttachments[a], &pRects[r]);
+      }
+   }
+
+   meta_clear_end(&saved_state, cmd_buffer);
+}
+
+/* Fill a width x height region of BO 'dest' (starting at dest_offset)
+ * with the 32-bit pattern 'data' by aliasing the BO as a linear 2D color
+ * image of 'fill_format' and running the image-clear path on it.
+ * The image's bo/offset are patched directly instead of going through
+ * vkBindImageMemory, which would require a dummy VkDeviceMemory object.
+ */
+static void
+do_buffer_fill(struct anv_cmd_buffer *cmd_buffer,
+               struct anv_bo *dest, uint64_t dest_offset,
+               int width, int height, VkFormat fill_format, uint32_t data)
+{
+   VkDevice vk_device = anv_device_to_handle(cmd_buffer->device);
+
+   VkImageCreateInfo image_info = {
+      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+      .imageType = VK_IMAGE_TYPE_2D,
+      .format = fill_format,
+      .extent = {
+         .width = width,
+         .height = height,
+         .depth = 1,
+      },
+      .mipLevels = 1,
+      .arrayLayers = 1,
+      .samples = 1,
+      .tiling = VK_IMAGE_TILING_LINEAR,
+      .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+      .flags = 0,
+   };
+
+   VkImage dest_image;
+   /* (redundant: .usage was already set in the initializer above) */
+   image_info.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+   anv_CreateImage(vk_device, &image_info,
+                   &cmd_buffer->pool->alloc, &dest_image);
+
+   /* We could use a vk call to bind memory, but that would require
+    * creating a dummy memory object etc. so there's really no point.
+    */
+   anv_image_from_handle(dest_image)->bo = dest;
+   anv_image_from_handle(dest_image)->offset = dest_offset;
+
+   /* Replicate the fill word across all four channels so every chosen
+    * format (R32, R32G32, R32G32B32A32) writes the same byte pattern.
+    */
+   const VkClearValue clear_value = {
+      .color = {
+         .uint32 = { data, data, data, data }
+      }
+   };
+
+   const VkImageSubresourceRange range = {
+      .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+      .baseMipLevel = 0,
+      .levelCount = 1,
+      .baseArrayLayer = 0,
+      .layerCount = 1,
+   };
+
+   anv_cmd_clear_image(cmd_buffer, anv_image_from_handle(dest_image),
+                       VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                       &clear_value, 1, &range);
+}
+
+/* vkCmdFillBuffer implemented as a series of 2D color-image clears.
+ *
+ * Picks the widest format whose alignment matches fillSize/dstOffset,
+ * then fills the region as: whole max-sized squares, one full-width
+ * rectangle, and a final single row for the remainder.
+ *
+ * NOTE(review): when fillSize is exactly max_fill_size the while loop
+ * does not run, height then equals max_surface_dim, and the
+ * assert(height < max_surface_dim) below fires -- confirm whether the
+ * loop condition should be >=.
+ */
+void anv_CmdFillBuffer(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    dstBuffer,
+    VkDeviceSize                                dstOffset,
+    VkDeviceSize                                fillSize,
+    uint32_t                                    data)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
+   struct anv_meta_saved_state saved_state;
+
+   meta_clear_begin(&saved_state, cmd_buffer);
+
+   /* Choose the widest texel that both size and offset are aligned to */
+   VkFormat format;
+   int bs;
+   if ((fillSize & 15) == 0 && (dstOffset & 15) == 0) {
+      format = VK_FORMAT_R32G32B32A32_UINT;
+      bs = 16;
+   } else if ((fillSize & 7) == 0 && (dstOffset & 15) == 0) {
+      format = VK_FORMAT_R32G32_UINT;
+      bs = 8;
+   } else {
+      assert((fillSize & 3) == 0 && (dstOffset & 3) == 0);
+      format = VK_FORMAT_R32_UINT;
+      bs = 4;
+   }
+
+   /* This is maximum possible width/height our HW can handle */
+   const uint64_t max_surface_dim = 1 << 14;
+
+   /* First, we make a bunch of max-sized copies */
+   const uint64_t max_fill_size = max_surface_dim * max_surface_dim * bs;
+   while (fillSize > max_fill_size) {
+      do_buffer_fill(cmd_buffer, dst_buffer->bo,
+                     dst_buffer->offset + dstOffset,
+                     max_surface_dim, max_surface_dim, format, data);
+      fillSize -= max_fill_size;
+      dstOffset += max_fill_size;
+   }
+
+   /* Then a single full-width rectangle for the bulk of the remainder */
+   uint64_t height = fillSize / (max_surface_dim * bs);
+   assert(height < max_surface_dim);
+   if (height != 0) {
+      const uint64_t rect_fill_size = height * max_surface_dim * bs;
+      do_buffer_fill(cmd_buffer, dst_buffer->bo,
+                     dst_buffer->offset + dstOffset,
+                     max_surface_dim, height, format, data);
+      fillSize -= rect_fill_size;
+      dstOffset += rect_fill_size;
+   }
+
+   /* Finally, one partial row of fillSize / bs texels */
+   if (fillSize != 0) {
+      do_buffer_fill(cmd_buffer, dst_buffer->bo,
+                     dst_buffer->offset + dstOffset,
+                     fillSize / bs, 1, format, data);
+   }
+
+   meta_clear_end(&saved_state, cmd_buffer);
+}
diff --git a/src/vulkan/anv_meta_clear.h b/src/vulkan/anv_meta_clear.h
new file mode 100644 (file)
index 0000000..853d9f2
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct anv_device;
+
+/* One-time init/teardown of the device-level meta-clear pipelines/state */
+VkResult anv_device_init_meta_clear_state(struct anv_device *device);
+void anv_device_finish_meta_clear_state(struct anv_device *device);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/vulkan/anv_nir.h b/src/vulkan/anv_nir.h
new file mode 100644 (file)
index 0000000..9a7a76f
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "glsl/nir/nir.h"
+#include "anv_private.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Turn load_push_constant intrinsics into plain uniform loads */
+void anv_nir_lower_push_constants(nir_shader *shader, bool is_scalar);
+
+/* Rewrite dynamic-offset UBO/SSBO accesses and register the offset/range
+ * push-constant params for them.
+ */
+void anv_nir_apply_dynamic_offsets(struct anv_pipeline *pipeline,
+                                   nir_shader *shader,
+                                   struct brw_stage_prog_data *prog_data);
+/* Flatten descriptor-set references to binding-table indices; returns
+ * whether the shader was modified.
+ */
+bool anv_nir_apply_pipeline_layout(nir_shader *shader,
+                                   struct brw_stage_prog_data *prog_data,
+                                   const struct anv_pipeline_layout *layout);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/vulkan/anv_nir_apply_dynamic_offsets.c b/src/vulkan/anv_nir_apply_dynamic_offsets.c
new file mode 100644 (file)
index 0000000..6837a80
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "glsl/nir/nir_builder.h"
+
+/* Shared state threaded through the per-block pass callback */
+struct apply_dynamic_offsets_state {
+   nir_shader *shader;           /* shader being rewritten */
+   nir_builder builder;          /* builder positioned per instruction */
+
+   struct anv_pipeline_layout *layout;
+
+   uint32_t indices_start;       /* uniform offset where the (offset, range)
+                                  * pairs for dynamic buffers begin */
+};
+
+/* Per-block pass body: rewrite each UBO/SSBO access whose descriptor has
+ * a dynamic offset.
+ *
+ * For every such access, a 2-component load_uniform is emitted that reads
+ * the pushed (offset, range) pair for the dynamic buffer.  The offset is
+ * added to the access's offset source, and the access is moved into an
+ * 'if (range >= original_offset)' so out-of-bounds addresses are
+ * predicated away.  Loads additionally get a phi that yields zero on the
+ * else path.
+ */
+static bool
+apply_dynamic_offsets_block(nir_block *block, void *void_state)
+{
+   struct apply_dynamic_offsets_state *state = void_state;
+   struct anv_descriptor_set_layout *set_layout;
+
+   nir_builder *b = &state->builder;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      /* Which source of the intrinsic holds the block (resource) index */
+      unsigned block_idx_src;
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ssbo:
+         block_idx_src = 0;
+         break;
+      case nir_intrinsic_store_ssbo:
+         block_idx_src = 1;
+         break;
+      default:
+         continue; /* the loop */
+      }
+
+      /* The block index must come from a vulkan_resource_index intrinsic,
+       * which carries the (set, binding) pair as constants.
+       */
+      nir_instr *res_instr = intrin->src[block_idx_src].ssa->parent_instr;
+      assert(res_instr->type == nir_instr_type_intrinsic);
+      nir_intrinsic_instr *res_intrin = nir_instr_as_intrinsic(res_instr);
+      assert(res_intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
+
+      unsigned set = res_intrin->const_index[0];
+      unsigned binding = res_intrin->const_index[1];
+
+      /* Skip bindings without a dynamic offset */
+      set_layout = state->layout->set[set].layout;
+      if (set_layout->binding[binding].dynamic_offset_index < 0)
+         continue;
+
+      b->cursor = nir_before_instr(&intrin->instr);
+
+      /* First, we need to generate the uniform load for the buffer offset */
+      uint32_t index = state->layout->set[set].dynamic_offset_start +
+                       set_layout->binding[binding].dynamic_offset_index;
+
+      /* .xy = (offset, range) of the selected dynamic buffer */
+      nir_intrinsic_instr *offset_load =
+         nir_intrinsic_instr_create(state->shader, nir_intrinsic_load_uniform);
+      offset_load->num_components = 2;
+      offset_load->const_index[0] = state->indices_start + index * 8;
+      offset_load->src[0] = nir_src_for_ssa(nir_imul(b, res_intrin->src[0].ssa,
+                                                     nir_imm_int(b, 8)));
+
+      nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, NULL);
+      nir_builder_instr_insert(b, &offset_load->instr);
+
+      nir_src *offset_src = nir_get_io_offset_src(intrin);
+      nir_ssa_def *new_offset = nir_iadd(b, offset_src->ssa,
+                                         &offset_load->dest.ssa);
+
+      /* In order to avoid out-of-bounds access, we predicate */
+      nir_ssa_def *pred = nir_uge(b, nir_channel(b, &offset_load->dest.ssa, 1),
+                                  offset_src->ssa);
+      nir_if *if_stmt = nir_if_create(b->shader);
+      if_stmt->condition = nir_src_for_ssa(pred);
+      nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+      /* Move the (rewritten) access into the then-branch */
+      nir_instr_remove(&intrin->instr);
+      *offset_src = nir_src_for_ssa(new_offset);
+      nir_instr_insert_after_cf_list(&if_stmt->then_list, &intrin->instr);
+
+      if (intrin->intrinsic != nir_intrinsic_store_ssbo) {
+         /* It's a load, we need a phi node */
+         nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+         nir_ssa_dest_init(&phi->instr, &phi->dest,
+                           intrin->num_components, NULL);
+
+         /* then-branch: the actual load result */
+         nir_phi_src *src1 = ralloc(phi, nir_phi_src);
+         struct exec_node *tnode = exec_list_get_tail(&if_stmt->then_list);
+         src1->pred = exec_node_data(nir_block, tnode, cf_node.node);
+         src1->src = nir_src_for_ssa(&intrin->dest.ssa);
+         exec_list_push_tail(&phi->srcs, &src1->node);
+
+         /* else-branch: a zero vector of matching width */
+         b->cursor = nir_after_cf_list(&if_stmt->else_list);
+         nir_ssa_def *zero = nir_build_imm(b, intrin->num_components,
+            (nir_const_value) { .u = { 0, 0, 0, 0 } });
+
+         nir_phi_src *src2 = ralloc(phi, nir_phi_src);
+         struct exec_node *enode = exec_list_get_tail(&if_stmt->else_list);
+         src2->pred = exec_node_data(nir_block, enode, cf_node.node);
+         src2->src = nir_src_for_ssa(zero);
+         exec_list_push_tail(&phi->srcs, &src2->node);
+
+         assert(intrin->dest.is_ssa);
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  nir_src_for_ssa(&phi->dest.ssa));
+
+         nir_instr_insert_after_cf(&if_stmt->cf_node, &phi->instr);
+      }
+   }
+
+   return true;
+}
+
+/* Entry point: run apply_dynamic_offsets_block over every function impl,
+ * then append prog_data->param[] entries pointing at the offset/range
+ * fields of anv_push_constants (computed from a NULL base pointer, i.e.
+ * the offsetof idiom) so the backend uploads them as push constants.
+ *
+ * NOTE(review): param[] is written at i * 2 + num_uniforms (two slots per
+ * dynamic buffer) while num_uniforms is advanced by 8 per buffer --
+ * matching the 'index * 8' addressing in the block pass.  Confirm the
+ * intended units (param elements vs. bytes) are consistent here.
+ */
+void
+anv_nir_apply_dynamic_offsets(struct anv_pipeline *pipeline,
+                              nir_shader *shader,
+                              struct brw_stage_prog_data *prog_data)
+{
+   struct apply_dynamic_offsets_state state = {
+      .shader = shader,
+      .layout = pipeline->layout,
+      .indices_start = shader->num_uniforms,
+   };
+
+   /* Nothing to do if this stage has no dynamic-offset descriptors */
+   if (!state.layout || !state.layout->stage[shader->stage].has_dynamic_offsets)
+      return;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_builder_init(&state.builder, function->impl);
+         nir_foreach_block(function->impl, apply_dynamic_offsets_block, &state);
+         nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                               nir_metadata_dominance);
+      }
+   }
+
+   struct anv_push_constants *null_data = NULL;
+   for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) {
+      prog_data->param[i * 2 + shader->num_uniforms] =
+         (const union gl_constant_value *)&null_data->dynamic[i].offset;
+      prog_data->param[i * 2 + 1 + shader->num_uniforms] =
+         (const union gl_constant_value *)&null_data->dynamic[i].range;
+   }
+
+   shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 8;
+}
diff --git a/src/vulkan/anv_nir_apply_pipeline_layout.c b/src/vulkan/anv_nir_apply_pipeline_layout.c
new file mode 100644 (file)
index 0000000..b7b8bd1
--- /dev/null
@@ -0,0 +1,322 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "program/prog_parameter.h"
+#include "glsl/nir/nir_builder.h"
+
+/* Shared state threaded through the per-block pass callback */
+struct apply_pipeline_layout_state {
+   nir_shader *shader;           /* shader being rewritten */
+   nir_builder builder;          /* builder positioned per instruction */
+
+   const struct anv_pipeline_layout *layout;
+
+   bool progress;                /* set when anything was lowered */
+};
+
+/* Map a (set, binding) pair to the flat binding-table surface index for
+ * the current shader stage: the set's per-stage surface_start plus the
+ * binding's per-stage surface_index within that set.
+ */
+static uint32_t
+get_surface_index(unsigned set, unsigned binding,
+                  struct apply_pipeline_layout_state *state)
+{
+   assert(set < state->layout->num_sets);
+   struct anv_descriptor_set_layout *set_layout =
+      state->layout->set[set].layout;
+
+   gl_shader_stage stage = state->shader->stage;
+
+   assert(binding < set_layout->binding_count);
+
+   assert(set_layout->binding[binding].stage[stage].surface_index >= 0);
+
+   uint32_t surface_index =
+      state->layout->set[set].stage[stage].surface_start +
+      set_layout->binding[binding].stage[stage].surface_index;
+
+   assert(surface_index < state->layout->stage[stage].surface_count);
+
+   return surface_index;
+}
+
+/* Map a (set, binding) pair to the flat sampler index for the current
+ * shader stage.  A binding without a sampler slot is only legal for
+ * texel fetches (txf never uses the sampler) and falls back to index 0.
+ */
+static uint32_t
+get_sampler_index(unsigned set, unsigned binding, nir_texop tex_op,
+                  struct apply_pipeline_layout_state *state)
+{
+   assert(set < state->layout->num_sets);
+   struct anv_descriptor_set_layout *set_layout =
+      state->layout->set[set].layout;
+
+   assert(binding < set_layout->binding_count);
+
+   gl_shader_stage stage = state->shader->stage;
+
+   if (set_layout->binding[binding].stage[stage].sampler_index < 0) {
+      assert(tex_op == nir_texop_txf);
+      return 0;
+   }
+
+   uint32_t sampler_index =
+      state->layout->set[set].stage[stage].sampler_start +
+      set_layout->binding[binding].stage[stage].sampler_index;
+
+   assert(sampler_index < state->layout->stage[stage].sampler_count);
+
+   return sampler_index;
+}
+
+/* Map a (set, binding) pair to the flat storage-image index for the
+ * current shader stage, analogous to get_surface_index().
+ */
+static uint32_t
+get_image_index(unsigned set, unsigned binding,
+                struct apply_pipeline_layout_state *state)
+{
+   assert(set < state->layout->num_sets);
+   struct anv_descriptor_set_layout *set_layout =
+      state->layout->set[set].layout;
+
+   assert(binding < set_layout->binding_count);
+
+   gl_shader_stage stage = state->shader->stage;
+
+   assert(set_layout->binding[binding].stage[stage].image_index >= 0);
+
+   uint32_t image_index =
+      state->layout->set[set].stage[stage].image_start +
+      set_layout->binding[binding].stage[stage].image_index;
+
+   assert(image_index < state->layout->stage[stage].image_count);
+
+   return image_index;
+}
+
+/* Replace a vulkan_resource_index intrinsic with the actual binding-table
+ * index: a constant array index folds into a single immediate, otherwise
+ * an iadd of the base surface index and the dynamic index is emitted.
+ */
+static void
+lower_res_index_intrinsic(nir_intrinsic_instr *intrin,
+                          struct apply_pipeline_layout_state *state)
+{
+   nir_builder *b = &state->builder;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   uint32_t set = intrin->const_index[0];
+   uint32_t binding = intrin->const_index[1];
+
+   uint32_t surface_index = get_surface_index(set, binding, state);
+
+   nir_const_value *const_block_idx =
+      nir_src_as_const_value(intrin->src[0]);
+
+   nir_ssa_def *block_index;
+   if (const_block_idx) {
+      block_index = nir_imm_int(b, surface_index + const_block_idx->u[0]);
+   } else {
+      block_index = nir_iadd(b, nir_imm_int(b, surface_index),
+                             nir_ssa_for_src(b, intrin->src[0], 1));
+   }
+
+   assert(intrin->dest.is_ssa);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(block_index));
+   nir_instr_remove(&intrin->instr);
+}
+
+/* Fold a texture/sampler deref chain into '*const_index'.  A constant
+ * array offset is added directly; an indirect offset is instead appended
+ * to the texture instruction as an extra source of type 'src_type' (the
+ * src array is reallocated one slot larger and the existing sources are
+ * moved over).
+ */
+static void
+lower_tex_deref(nir_tex_instr *tex, nir_deref_var *deref,
+                unsigned *const_index, nir_tex_src_type src_type,
+                struct apply_pipeline_layout_state *state)
+{
+   if (deref->deref.child) {
+      assert(deref->deref.child->deref_type == nir_deref_type_array);
+      nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child);
+
+      *const_index += deref_array->base_offset;
+
+      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+         nir_tex_src *new_srcs = rzalloc_array(tex, nir_tex_src,
+                                               tex->num_srcs + 1);
+
+         for (unsigned i = 0; i < tex->num_srcs; i++) {
+            new_srcs[i].src_type = tex->src[i].src_type;
+            nir_instr_move_src(&tex->instr, &new_srcs[i].src, &tex->src[i].src);
+         }
+
+         ralloc_free(tex->src);
+         tex->src = new_srcs;
+
+         /* Now we can go ahead and move the source over to being a
+          * first-class texture source.
+          */
+         tex->src[tex->num_srcs].src_type = src_type;
+         tex->num_srcs++;
+         assert(deref_array->indirect.is_ssa);
+         nir_instr_rewrite_src(&tex->instr, &tex->src[tex->num_srcs - 1].src,
+                               deref_array->indirect);
+      }
+   }
+}
+
+/* Drop the instruction's use of an indirect deref's SSA source (if the
+ * deref chain has one) so the chain can be detached safely.
+ */
+static void
+cleanup_tex_deref(nir_tex_instr *tex, nir_deref_var *deref)
+{
+   if (deref->deref.child == NULL)
+      return;
+
+   nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child);
+
+   if (deref_array->deref_array_type != nir_deref_array_type_indirect)
+      return;
+
+   nir_instr_rewrite_src(&tex->instr, &deref_array->indirect, NIR_SRC_INIT);
+}
+
+/* Rewrite one texture instruction from deref-based texture/sampler
+ * references to flat texture_index/sampler_index values, folding any
+ * array indirects in via lower_tex_deref(), then detach the derefs.
+ */
+static void
+lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state)
+{
+   /* No one should have come by and lowered it already */
+   assert(tex->sampler);
+
+   /* A texture-only deref may be absent; fall back to the sampler deref */
+   nir_deref_var *tex_deref = tex->texture ? tex->texture : tex->sampler;
+   tex->texture_index =
+      get_surface_index(tex_deref->var->data.descriptor_set,
+                        tex_deref->var->data.binding, state);
+   lower_tex_deref(tex, tex_deref, &tex->texture_index,
+                   nir_tex_src_texture_offset, state);
+
+   tex->sampler_index =
+      get_sampler_index(tex->sampler->var->data.descriptor_set,
+                        tex->sampler->var->data.binding, tex->op, state);
+   lower_tex_deref(tex, tex->sampler, &tex->sampler_index,
+                   nir_tex_src_sampler_offset, state);
+
+   /* The backend only ever uses this to mark used surfaces.  We don't care
+    * about that little optimization so it just needs to be non-zero.
+    */
+   tex->texture_array_size = 1;
+
+   if (tex->texture)
+      cleanup_tex_deref(tex, tex->texture);
+   cleanup_tex_deref(tex, tex->sampler);
+   tex->texture = NULL;
+   tex->sampler = NULL;
+}
+
+/* Per-block pass body: lower vulkan_resource_index intrinsics and all
+ * texture instructions, recording whether anything changed in
+ * state->progress.
+ */
+static bool
+apply_pipeline_layout_block(nir_block *block, void *void_state)
+{
+   struct apply_pipeline_layout_state *state = void_state;
+
+   nir_foreach_instr_safe(block, instr) {
+      switch (instr->type) {
+      case nir_instr_type_intrinsic: {
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
+            lower_res_index_intrinsic(intrin, state);
+            state->progress = true;
+         }
+         break;
+      }
+      case nir_instr_type_tex:
+         lower_tex(nir_instr_as_tex(instr), state);
+         /* All texture instructions need lowering */
+         state->progress = true;
+         break;
+      default:
+         continue;
+      }
+   }
+
+   return true;
+}
+
+/* Point the first n param slots at 'values' and pad the rest of the vec4
+ * (slots n..3) with a shared zero constant.
+ */
+static void
+setup_vec4_uniform_value(const union gl_constant_value **params,
+                         const union gl_constant_value *values,
+                         unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      params[i] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      params[i] = &zero;
+}
+
+/* Apply the pipeline layout to a shader: flatten all descriptor-set
+ * references via apply_pipeline_layout_block, then, if the stage uses
+ * storage images, append uniform param[] entries for each image's
+ * brw_image_param fields (surface index, offset, size, stride, tiling,
+ * swizzling).  The param addresses are offsets into anv_push_constants
+ * computed from a NULL base pointer (the offsetof idiom).  Returns
+ * whether the shader was modified.
+ */
+bool
+anv_nir_apply_pipeline_layout(nir_shader *shader,
+                              struct brw_stage_prog_data *prog_data,
+                              const struct anv_pipeline_layout *layout)
+{
+   struct apply_pipeline_layout_state state = {
+      .shader = shader,
+      .layout = layout,
+   };
+
+   nir_foreach_function(shader, function) {
+      if (function->impl) {
+         nir_builder_init(&state.builder, function->impl);
+         nir_foreach_block(function->impl, apply_pipeline_layout_block, &state);
+         nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                               nir_metadata_dominance);
+      }
+   }
+
+   if (layout->stage[shader->stage].image_count > 0) {
+      nir_foreach_variable(var, &shader->uniforms) {
+         if (glsl_type_is_image(var->type) ||
+             (glsl_type_is_array(var->type) &&
+              glsl_type_is_image(glsl_get_array_element(var->type)))) {
+            /* Images are represented as uniform push constants and the actual
+             * information required for reading/writing to/from the image is
+             * stored in the uniform.
+             */
+            unsigned image_index = get_image_index(var->data.descriptor_set,
+                                                   var->data.binding, &state);
+
+            var->data.driver_location = shader->num_uniforms +
+                                        image_index * BRW_IMAGE_PARAM_SIZE * 4;
+         }
+      }
+
+      /* Register one brw_image_param worth of params per image, addressed
+       * relative to anv_push_constants via a NULL base pointer.
+       */
+      struct anv_push_constants *null_data = NULL;
+      const gl_constant_value **param = prog_data->param + shader->num_uniforms;
+      const struct brw_image_param *image_param = null_data->images;
+      for (uint32_t i = 0; i < layout->stage[shader->stage].image_count; i++) {
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET,
+            (const union gl_constant_value *)&image_param->surface_idx, 1);
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_OFFSET_OFFSET,
+            (const union gl_constant_value *)image_param->offset, 2);
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SIZE_OFFSET,
+            (const union gl_constant_value *)image_param->size, 3);
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_STRIDE_OFFSET,
+            (const union gl_constant_value *)image_param->stride, 4);
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_TILING_OFFSET,
+            (const union gl_constant_value *)image_param->tiling, 3);
+         setup_vec4_uniform_value(param + BRW_IMAGE_PARAM_SWIZZLING_OFFSET,
+            (const union gl_constant_value *)image_param->swizzling, 2);
+
+         param += BRW_IMAGE_PARAM_SIZE;
+         image_param ++;
+      }
+
+      shader->num_uniforms += layout->stage[shader->stage].image_count *
+                              BRW_IMAGE_PARAM_SIZE * 4;
+   }
+
+   return state.progress;
+}
diff --git a/src/vulkan/anv_nir_lower_push_constants.c b/src/vulkan/anv_nir_lower_push_constants.c
new file mode 100644 (file)
index 0000000..7fc3953
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+
+/* Walk state for the push-constant lowering pass. */
+struct lower_push_constants_state {
+   nir_shader *shader;   /* shader being rewritten; new instructions are allocated from it */
+   bool is_scalar;       /* scalar backend: offsets stay in dwords; vec4 backend: vec4 slots */
+};
+
+/* Per-block callback: rewrite each load_push_constant intrinsic into a
+ * load_uniform at the equivalent offset.  For scalar backends the offset is
+ * kept in dwords.  For vec4 backends the offset is converted to vec4 slots;
+ * when the load is not vec4-aligned, a whole vec4 is loaded and an imov
+ * swizzles the wanted components into place.  Always returns true so the
+ * block iteration continues.
+ */
+static bool
+lower_push_constants_block(nir_block *block, void *void_state)
+{
+   struct lower_push_constants_state *state = void_state;
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      /* TODO: Handle indirect push constants */
+      if (intrin->intrinsic != nir_intrinsic_load_push_constant)
+         continue;
+
+      /* Push-constant offsets are expected to be dword-aligned. */
+      assert(intrin->const_index[0] % 4 == 0);
+      unsigned dword_offset = intrin->const_index[0] / 4;
+
+      /* We just turn them into uniform loads with the appropriate offset */
+      intrin->intrinsic = nir_intrinsic_load_uniform;
+      intrin->const_index[0] = 0;
+      if (state->is_scalar) {
+         intrin->const_index[1] = dword_offset;
+      } else {
+         unsigned shift = dword_offset % 4;
+         /* Can't cross the vec4 boundary */
+         assert(shift + intrin->num_components <= 4);
+
+         /* vec4 shifts are in units of vec4's */
+         intrin->const_index[1] = dword_offset / 4;
+
+         if (shift) {
+            /* If there's a non-zero shift then we need to load a whole vec4
+             * and use a move to swizzle it into place.
+             */
+            assert(intrin->dest.is_ssa);
+            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
+                                                      nir_op_imov);
+            mov->src[0].src = nir_src_for_ssa(&intrin->dest.ssa);
+            for (unsigned i = 0; i < intrin->num_components; i++)
+               mov->src[0].swizzle[i] = i + shift;
+            mov->dest.write_mask = (1 << intrin->num_components) - 1;
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
+                              intrin->num_components, NULL);
+
+            /* Repoint all existing users at the swizzling mov, then insert
+             * the mov right after the (now widened) uniform load.
+             */
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     nir_src_for_ssa(&mov->dest.dest.ssa));
+            nir_instr_insert_after(&intrin->instr, &mov->instr);
+
+            /* Stomp the number of components to 4 */
+            intrin->num_components = 4;
+            intrin->dest.ssa.num_components = 4;
+         }
+      }
+   }
+
+   return true;
+}
+
+/* Lower every push-constant load in the shader to a uniform load, then
+ * rescale shader->num_uniforms (a byte count, asserted to be a multiple of
+ * four) into the backend's units: dwords for scalar backends, vec4 slots
+ * (rounded up) for vec4 backends.
+ */
+void
+anv_nir_lower_push_constants(nir_shader *shader, bool is_scalar)
+{
+   struct lower_push_constants_state state = {
+      .shader = shader,
+      .is_scalar = is_scalar,
+   };
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         nir_foreach_block(function->impl, lower_push_constants_block, &state);
+   }
+
+   assert(shader->num_uniforms % 4 == 0);
+   if (is_scalar)
+      shader->num_uniforms /= 4;
+   else
+      shader->num_uniforms = DIV_ROUND_UP(shader->num_uniforms, 16);
+}
diff --git a/src/vulkan/anv_pass.c b/src/vulkan/anv_pass.c
new file mode 100644 (file)
index 0000000..ccd8ced
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+/* Create an anv_render_pass.  The pass struct, its subpass array, and its
+ * attachment array are carved out of one allocation; the flattened per-subpass
+ * attachment index lists live in a second allocation
+ * (pass->subpass_attachments), which input/color/resolve pointers slice into.
+ */
+VkResult anv_CreateRenderPass(
+    VkDevice                                    _device,
+    const VkRenderPassCreateInfo*               pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkRenderPass*                               pRenderPass)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_render_pass *pass;
+   size_t size;
+   size_t attachments_offset;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO);
+
+   /* Lay out pass + subpasses[] + attachments[] in a single block. */
+   size = sizeof(*pass);
+   size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
+   attachments_offset = size;
+   size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);
+
+   pass = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pass == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Clear the subpasses along with the parent pass. This is required because
+    * each array member of anv_subpass must be a valid pointer if not NULL.
+    */
+   memset(pass, 0, size);
+   pass->attachment_count = pCreateInfo->attachmentCount;
+   pass->subpass_count = pCreateInfo->subpassCount;
+   pass->attachments = (void *) pass + attachments_offset;
+
+   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+      struct anv_render_pass_attachment *att = &pass->attachments[i];
+
+      att->format = anv_format_for_vk_format(pCreateInfo->pAttachments[i].format);
+      att->samples = pCreateInfo->pAttachments[i].samples;
+      att->load_op = pCreateInfo->pAttachments[i].loadOp;
+      att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
+      // att->store_op = pCreateInfo->pAttachments[i].storeOp;
+      // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp;
+   }
+
+   uint32_t subpass_attachment_count = 0, *p;
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+
+      subpass_attachment_count +=
+         desc->inputAttachmentCount +
+         desc->colorAttachmentCount +
+         /* Count colorAttachmentCount again for resolve_attachments */
+         desc->colorAttachmentCount;
+   }
+
+   /* NOTE(review): if no subpass references any attachments this requests a
+    * zero-byte allocation -- confirm anv_alloc2 returns non-NULL for size 0,
+    * otherwise this path reports a spurious out-of-host-memory error.
+    */
+   pass->subpass_attachments =
+      anv_alloc2(&device->alloc, pAllocator,
+                 subpass_attachment_count * sizeof(uint32_t), 8,
+                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pass->subpass_attachments == NULL) {
+      anv_free2(&device->alloc, pAllocator, pass);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   /* p walks forward through subpass_attachments handing out sub-arrays. */
+   p = pass->subpass_attachments;
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+      struct anv_subpass *subpass = &pass->subpasses[i];
+
+      subpass->input_count = desc->inputAttachmentCount;
+      subpass->color_count = desc->colorAttachmentCount;
+
+      if (desc->inputAttachmentCount > 0) {
+         subpass->input_attachments = p;
+         p += desc->inputAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+            subpass->input_attachments[j]
+               = desc->pInputAttachments[j].attachment;
+         }
+      }
+
+      if (desc->colorAttachmentCount > 0) {
+         subpass->color_attachments = p;
+         p += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->color_attachments[j]
+               = desc->pColorAttachments[j].attachment;
+         }
+      }
+
+      /* Resolve attachments, when present, are one per color attachment. */
+      if (desc->pResolveAttachments) {
+         subpass->resolve_attachments = p;
+         p += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->resolve_attachments[j]
+               = desc->pResolveAttachments[j].attachment;
+         }
+      }
+
+      if (desc->pDepthStencilAttachment) {
+         subpass->depth_stencil_attachment =
+            desc->pDepthStencilAttachment->attachment;
+      } else {
+         subpass->depth_stencil_attachment = VK_ATTACHMENT_UNUSED;
+      }
+   }
+
+   *pRenderPass = anv_render_pass_to_handle(pass);
+
+   return VK_SUCCESS;
+}
+
+/* Destroy a render pass: free the flattened attachment-index array first,
+ * then the pass allocation itself (which also holds subpasses/attachments).
+ */
+void anv_DestroyRenderPass(
+    VkDevice                                    _device,
+    VkRenderPass                                _pass,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_render_pass, pass, _pass);
+
+   anv_free2(&device->alloc, pAllocator, pass->subpass_attachments);
+   anv_free2(&device->alloc, pAllocator, pass);
+}
+
+/* Render-area granularity is a single pixel: any render area is optimal. */
+void anv_GetRenderAreaGranularity(
+    VkDevice                                    device,
+    VkRenderPass                                renderPass,
+    VkExtent2D*                                 pGranularity)
+{
+   *pGranularity = (VkExtent2D) { 1, 1 };
+}
diff --git a/src/vulkan/anv_pipeline.c b/src/vulkan/anv_pipeline.c
new file mode 100644 (file)
index 0000000..d66987f
--- /dev/null
@@ -0,0 +1,1296 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+#include "brw_nir.h"
+#include "anv_nir.h"
+#include "glsl/nir/spirv/nir_spirv.h"
+
+/* Needed for SWIZZLE macros */
+#include "program/prog_instruction.h"
+
+// Shader functions
+
+/* Create a shader module by copying the client's SPIR-V into storage that
+ * trails the module struct (allocated as sizeof(*module) + codeSize; assumes
+ * module->data is the trailing storage -- see anv_shader_module).  module->nir
+ * stays NULL here; meta shaders fill it in to bypass SPIR-V parsing.
+ */
+VkResult anv_CreateShaderModule(
+    VkDevice                                    _device,
+    const VkShaderModuleCreateInfo*             pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkShaderModule*                             pShaderModule)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_shader_module *module;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO);
+   assert(pCreateInfo->flags == 0);
+
+   module = anv_alloc2(&device->alloc, pAllocator,
+                       sizeof(*module) + pCreateInfo->codeSize, 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (module == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   module->nir = NULL;
+   module->size = pCreateInfo->codeSize;
+   memcpy(module->data, pCreateInfo->pCode, module->size);
+
+   *pShaderModule = anv_shader_module_to_handle(module);
+
+   return VK_SUCCESS;
+}
+
+/* Destroy a shader module (single allocation; SPIR-V data is inline). */
+void anv_DestroyShaderModule(
+    VkDevice                                    _device,
+    VkShaderModule                              _module,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_shader_module, module, _module);
+
+   anv_free2(&device->alloc, pAllocator, module);
+}
+
+#define SPIR_V_MAGIC_NUMBER 0x07230203
+
+/* Eventually, this will become part of anv_CreateShader.  Unfortunately,
+ * we can't do that yet because we don't have the ability to copy nir.
+ */
+/* Translate a shader module into preprocessed NIR for the given stage.
+ *
+ * Two paths: if the module carries pre-built NIR (meta clear/blit shaders),
+ * the SPIR-V data is ignored entirely; otherwise the module data is parsed as
+ * SPIR-V with the caller's specialization constants applied, all functions
+ * except the requested entrypoint are removed, and the standard lowering
+ * passes are run.  Both paths end with brw_preprocess_nir and indirect-deref
+ * lowering appropriate to the stage's scalar/vec4 mode.
+ */
+static nir_shader *
+anv_shader_compile_to_nir(struct anv_device *device,
+                          struct anv_shader_module *module,
+                          const char *entrypoint_name,
+                          gl_shader_stage stage,
+                          const VkSpecializationInfo *spec_info)
+{
+   if (strcmp(entrypoint_name, "main") != 0) {
+      anv_finishme("Multiple shaders per module not really supported");
+   }
+
+   const struct brw_compiler *compiler =
+      device->instance->physicalDevice.compiler;
+   const nir_shader_compiler_options *nir_options =
+      compiler->glsl_compiler_options[stage].NirOptions;
+
+   nir_shader *nir;
+   nir_function *entry_point;
+   if (module->nir) {
+      /* Some things such as our meta clear/blit code will give us a NIR
+       * shader directly.  In that case, we just ignore the SPIR-V entirely
+       * and just use the NIR shader */
+      nir = module->nir;
+      nir->options = nir_options;
+      nir_validate_shader(nir);
+
+      /* Meta NIR is expected to contain exactly one function. */
+      assert(exec_list_length(&nir->functions) == 1);
+      struct exec_node *node = exec_list_get_head(&nir->functions);
+      entry_point = exec_node_data(nir_function, node, node);
+   } else {
+      uint32_t *spirv = (uint32_t *) module->data;
+      assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
+      assert(module->size % 4 == 0);
+
+      /* Flatten the client's VkSpecializationInfo into id/value pairs for
+       * spirv_to_nir.  Each map entry is assumed to cover a 32-bit value.
+       */
+      uint32_t num_spec_entries = 0;
+      struct nir_spirv_specialization *spec_entries = NULL;
+      if (spec_info && spec_info->mapEntryCount > 0) {
+         num_spec_entries = spec_info->mapEntryCount;
+         spec_entries = malloc(num_spec_entries * sizeof(*spec_entries));
+         for (uint32_t i = 0; i < num_spec_entries; i++) {
+            const uint32_t *data =
+               spec_info->pData + spec_info->pMapEntries[i].offset;
+            assert((const void *)(data + 1) <=
+                   spec_info->pData + spec_info->dataSize);
+
+            spec_entries[i].id = spec_info->pMapEntries[i].constantID;
+            spec_entries[i].data = *data;
+         }
+      }
+
+      entry_point = spirv_to_nir(spirv, module->size / 4,
+                                 spec_entries, num_spec_entries,
+                                 stage, entrypoint_name, nir_options);
+      nir = entry_point->shader;
+      assert(nir->stage == stage);
+      nir_validate_shader(nir);
+
+      free(spec_entries);
+
+      nir_lower_returns(nir);
+      nir_validate_shader(nir);
+
+      nir_inline_functions(nir);
+      nir_validate_shader(nir);
+
+      /* Pick off the single entrypoint that we want */
+      foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
+         if (func != entry_point)
+            exec_node_remove(&func->node);
+      }
+      assert(exec_list_length(&nir->functions) == 1);
+      entry_point->name = ralloc_strdup(entry_point, "main");
+
+      nir_remove_dead_variables(nir, nir_var_shader_in);
+      nir_remove_dead_variables(nir, nir_var_shader_out);
+      nir_remove_dead_variables(nir, nir_var_system_value);
+      nir_validate_shader(nir);
+
+      nir_lower_outputs_to_temporaries(entry_point->shader, entry_point);
+
+      nir_lower_system_values(nir);
+      nir_validate_shader(nir);
+   }
+
+   /* Vulkan uses the separate-shader linking model */
+   nir->info.separate_shader = true;
+
+   nir = brw_preprocess_nir(nir, compiler->scalar_stage[stage]);
+
+   nir_shader_gather_info(nir, entry_point->impl);
+
+   /* Always lower indirect input derefs; also lower indirect locals when the
+    * backend asks for no indirect temporaries.
+    */
+   uint32_t indirect_mask = (1 << nir_var_shader_in);
+   if (compiler->glsl_compiler_options[stage].EmitNoIndirectTemp)
+      indirect_mask |= 1 << nir_var_local;
+
+   nir_lower_indirect_derefs(nir, indirect_mask);
+
+   return nir;
+}
+
+/* Initialize a pipeline cache: a program stream backed by the device's
+ * instruction block pool, guarded by a mutex for concurrent uploads.
+ */
+void
+anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
+                        struct anv_device *device)
+{
+   cache->device = device;
+   anv_state_stream_init(&cache->program_stream,
+                         &device->instruction_block_pool);
+   pthread_mutex_init(&cache->mutex, NULL);
+}
+
+/* Tear down a pipeline cache's program stream and its mutex. */
+void
+anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
+{
+   anv_state_stream_finish(&cache->program_stream);
+   pthread_mutex_destroy(&cache->mutex);
+}
+
+/* Copy a compiled kernel into the cache's program stream and return its
+ * offset (used as the kernel start address).  Only the stream allocation is
+ * mutex-protected; the copy targets a distinct, freshly allocated range.
+ * Flushes the cache line on non-LLC platforms so the GPU sees the data.
+ */
+static uint32_t
+anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
+                                 const void *data, size_t size)
+{
+   pthread_mutex_lock(&cache->mutex);
+
+   struct anv_state state =
+      anv_state_stream_alloc(&cache->program_stream, size, 64);
+
+   pthread_mutex_unlock(&cache->mutex);
+
+   /* NOTE(review): this size check runs after the allocation has already
+    * been made -- confirm it shouldn't precede anv_state_stream_alloc.
+    */
+   assert(size < cache->program_stream.block_pool->block_size);
+
+   memcpy(state.map, data, size);
+
+   if (!cache->device->info.has_llc)
+      anv_state_clflush(state);
+
+   return state.offset;
+}
+
+/* Create a pipeline cache object.  Initial data from pCreateInfo is not
+ * consumed; the cache starts empty (see anv_GetPipelineCacheData).
+ */
+VkResult anv_CreatePipelineCache(
+    VkDevice                                    _device,
+    const VkPipelineCacheCreateInfo*            pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipelineCache*                            pPipelineCache)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_pipeline_cache *cache;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
+   assert(pCreateInfo->flags == 0);
+
+   cache = anv_alloc2(&device->alloc, pAllocator,
+                       sizeof(*cache), 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (cache == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   anv_pipeline_cache_init(cache, device);
+
+   *pPipelineCache = anv_pipeline_cache_to_handle(cache);
+
+   return VK_SUCCESS;
+}
+
+/* Destroy a pipeline cache: release its stream/mutex, then the struct. */
+void anv_DestroyPipelineCache(
+    VkDevice                                    _device,
+    VkPipelineCache                             _cache,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
+
+   anv_pipeline_cache_finish(cache);
+
+   anv_free2(&device->alloc, pAllocator, cache);
+}
+
+/* Cache serialization is not implemented: always report zero bytes. */
+VkResult anv_GetPipelineCacheData(
+    VkDevice                                    device,
+    VkPipelineCache                             pipelineCache,
+    size_t*                                     pDataSize,
+    void*                                       pData)
+{
+   *pDataSize = 0;
+
+   return VK_SUCCESS;
+}
+
+/* Cache merging is not implemented; stub that reports success. */
+VkResult anv_MergePipelineCaches(
+    VkDevice                                    device,
+    VkPipelineCache                             destCache,
+    uint32_t                                    srcCacheCount,
+    const VkPipelineCache*                      pSrcCaches)
+{
+   stub_return(VK_SUCCESS);
+}
+
+/* Destroy a pipeline: release its relocation list, its blend state (if one
+ * was allocated from the dynamic state pool), and the pipeline itself.
+ */
+void anv_DestroyPipeline(
+    VkDevice                                    _device,
+    VkPipeline                                  _pipeline,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+
+   anv_reloc_list_finish(&pipeline->batch_relocs,
+                         pAllocator ? pAllocator : &device->alloc);
+   if (pipeline->blend_state.map)
+      anv_state_pool_free(&device->dynamic_state_pool, pipeline->blend_state);
+   anv_free2(&device->alloc, pAllocator, pipeline);
+}
+
+/* Map VkPrimitiveTopology to the hardware 3DPRIM_* topology values.
+ * Patch lists are not handled here (see the commented-out entry).
+ */
+static const uint32_t vk_to_gen_primitive_type[] = {
+   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
+   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
+   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
+   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
+   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
+   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
+   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
+   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
+   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
+   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+/*   [VK_PRIMITIVE_TOPOLOGY_PATCH_LIST]                = _3DPRIM_PATCHLIST_1 */
+};
+
+/* Fill the shared sampler portion of a program key with identity swizzles. */
+static void
+populate_sampler_prog_key(const struct brw_device_info *devinfo,
+                          struct brw_sampler_prog_key_data *key)
+{
+   /* XXX: Handle texture swizzle on HSW- */
+   for (int i = 0; i < MAX_SAMPLERS; i++) {
+      /* Assume color sampler, no swizzling. (Works for BDW+) */
+      key->swizzles[i] = SWIZZLE_XYZW;
+   }
+}
+
+/* Build the vertex-shader compile key: zeroed except for sampler defaults. */
+static void
+populate_vs_prog_key(const struct brw_device_info *devinfo,
+                     struct brw_vs_prog_key *key)
+{
+   memset(key, 0, sizeof(*key));
+
+   populate_sampler_prog_key(devinfo, &key->tex);
+
+   /* XXX: Handle vertex input work-arounds */
+
+   /* XXX: Handle sampler_prog_key */
+}
+
+/* Build the geometry-shader compile key: zeroed except for sampler defaults. */
+static void
+populate_gs_prog_key(const struct brw_device_info *devinfo,
+                     struct brw_gs_prog_key *key)
+{
+   memset(key, 0, sizeof(*key));
+
+   populate_sampler_prog_key(devinfo, &key->tex);
+}
+
+/* Build the fragment-shader compile key from the pipeline create info.
+ * The color-region count comes either from the `extra` override (meta
+ * pipelines) or from the referenced subpass; multisample state drives the
+ * per-sample shading fields.
+ */
+static void
+populate_wm_prog_key(const struct brw_device_info *devinfo,
+                     const VkGraphicsPipelineCreateInfo *info,
+                     const struct anv_graphics_pipeline_create_info *extra,
+                     struct brw_wm_prog_key *key)
+{
+   ANV_FROM_HANDLE(anv_render_pass, render_pass, info->renderPass);
+
+   memset(key, 0, sizeof(*key));
+
+   populate_sampler_prog_key(devinfo, &key->tex);
+
+   /* TODO: Fill out key->input_slots_valid */
+
+   /* Vulkan doesn't specify a default */
+   key->high_quality_derivatives = false;
+
+   /* XXX Vulkan doesn't appear to specify */
+   key->clamp_fragment_color = false;
+
+   /* Vulkan always specifies upper-left coordinates */
+   key->drawable_height = 0;
+   key->render_to_fbo = false;
+
+   if (extra && extra->color_attachment_count >= 0) {
+      key->nr_color_regions = extra->color_attachment_count;
+   } else {
+      key->nr_color_regions =
+         render_pass->subpasses[info->subpass].color_count;
+   }
+
+   key->replicate_alpha = key->nr_color_regions > 1 &&
+                          info->pMultisampleState &&
+                          info->pMultisampleState->alphaToCoverageEnable;
+
+   if (info->pMultisampleState && info->pMultisampleState->rasterizationSamples > 1) {
+      /* We should probably pull this out of the shader, but it's fairly
+       * harmless to compute it and then let dead-code take care of it.
+       */
+      key->persample_shading = info->pMultisampleState->sampleShadingEnable;
+      if (key->persample_shading)
+         key->persample_2x = info->pMultisampleState->rasterizationSamples == 2;
+
+      key->compute_pos_offset = info->pMultisampleState->sampleShadingEnable;
+      key->compute_sample_id = info->pMultisampleState->sampleShadingEnable;
+   }
+}
+
+/* Build the compute-shader compile key: zeroed except for sampler defaults. */
+static void
+populate_cs_prog_key(const struct brw_device_info *devinfo,
+                     struct brw_cs_prog_key *key)
+{
+   memset(key, 0, sizeof(*key));
+
+   populate_sampler_prog_key(devinfo, &key->tex);
+}
+
+/* Stage-independent front half of pipeline compilation: translate the module
+ * to NIR, lower push constants, size and populate prog_data->param (push
+ * constants, dynamic offsets, image params), apply the pipeline layout, set
+ * binding-table offsets, and run the backend IO lowering.  Returns the lowered
+ * NIR, or NULL if translation failed.
+ */
+static nir_shader *
+anv_pipeline_compile(struct anv_pipeline *pipeline,
+                     struct anv_shader_module *module,
+                     const char *entrypoint,
+                     gl_shader_stage stage,
+                     const VkSpecializationInfo *spec_info,
+                     struct brw_stage_prog_data *prog_data)
+{
+   const struct brw_compiler *compiler =
+      pipeline->device->instance->physicalDevice.compiler;
+
+   nir_shader *nir = anv_shader_compile_to_nir(pipeline->device,
+                                               module, entrypoint, stage,
+                                               spec_info);
+   if (nir == NULL)
+      return NULL;
+
+   anv_nir_lower_push_constants(nir, compiler->scalar_stage[stage]);
+
+   /* Figure out the number of parameters */
+   prog_data->nr_params = 0;
+
+   if (nir->num_uniforms > 0) {
+      /* If the shader uses any push constants at all, we'll just give
+       * them the maximum possible number
+       */
+      prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
+   }
+
+   if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
+      prog_data->nr_params += MAX_DYNAMIC_BUFFERS * 2;
+
+   if (pipeline->layout && pipeline->layout->stage[stage].image_count > 0)
+      prog_data->nr_params += pipeline->layout->stage[stage].image_count *
+                              BRW_IMAGE_PARAM_SIZE;
+
+   if (prog_data->nr_params > 0) {
+      /* XXX: I think we're leaking this */
+      prog_data->param = (const union gl_constant_value **)
+         malloc(prog_data->nr_params * sizeof(union gl_constant_value *));
+
+      /* We now set the param values to be offsets into a
+       * anv_push_constant_data structure.  Since the compiler doesn't
+       * actually dereference any of the gl_constant_value pointers in the
+       * params array, it doesn't really matter what we put here.
+       */
+      struct anv_push_constants *null_data = NULL;
+      if (nir->num_uniforms > 0) {
+         /* Fill out the push constants section of the param array */
+         for (unsigned i = 0; i < MAX_PUSH_CONSTANTS_SIZE / sizeof(float); i++)
+            prog_data->param[i] = (const union gl_constant_value *)
+               &null_data->client_data[i * sizeof(float)];
+      }
+   }
+
+   /* Set up dynamic offsets */
+   anv_nir_apply_dynamic_offsets(pipeline, nir, prog_data);
+
+   /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
+   if (pipeline->layout)
+      anv_nir_apply_pipeline_layout(nir, prog_data, pipeline->layout);
+
+   /* All binding table offsets provided by apply_pipeline_layout() are
+    * relative to the start of the binding table (plus MAX_RTS for VS).
+    */
+   unsigned bias;
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT:
+      bias = MAX_RTS;
+      break;
+   case MESA_SHADER_COMPUTE:
+      bias = 1;
+      break;
+   default:
+      bias = 0;
+      break;
+   }
+   prog_data->binding_table.size_bytes = 0;
+   prog_data->binding_table.texture_start = bias;
+   prog_data->binding_table.ubo_start = bias;
+   prog_data->binding_table.ssbo_start = bias;
+   prog_data->binding_table.image_start = bias;
+
+   /* Finish the optimization and compilation process */
+   nir = brw_nir_lower_io(nir, &pipeline->device->info,
+                          compiler->scalar_stage[stage]);
+
+   /* nir_lower_io will only handle the push constants; we need to set this
+    * to the full number of possible uniforms.
+    */
+   nir->num_uniforms = prog_data->nr_params * 4;
+
+   return nir;
+}
+
+/* Record a freshly compiled stage on the pipeline: stash its prog_data, mark
+ * it active, and reserve scratch space scaled by the device's maximum thread
+ * count for that stage (tess stages are 0 -- not supported here).
+ */
+static void
+anv_pipeline_add_compiled_stage(struct anv_pipeline *pipeline,
+                                gl_shader_stage stage,
+                                struct brw_stage_prog_data *prog_data)
+{
+   struct brw_device_info *devinfo = &pipeline->device->info;
+   uint32_t max_threads[] = {
+      [MESA_SHADER_VERTEX]                  = devinfo->max_vs_threads,
+      [MESA_SHADER_TESS_CTRL]               = 0,
+      [MESA_SHADER_TESS_EVAL]               = 0,
+      [MESA_SHADER_GEOMETRY]                = devinfo->max_gs_threads,
+      [MESA_SHADER_FRAGMENT]                = devinfo->max_wm_threads,
+      [MESA_SHADER_COMPUTE]                 = devinfo->max_cs_threads,
+   };
+
+   pipeline->prog_data[stage] = prog_data;
+   pipeline->active_stages |= mesa_to_vk_shader_stage(stage);
+   /* Each stage's scratch region starts at the 1k-aligned running total. */
+   pipeline->scratch_start[stage] = pipeline->total_scratch;
+   pipeline->total_scratch =
+      align_u32(pipeline->total_scratch, 1024) +
+      prog_data->total_scratch * max_threads[stage];
+}
+
+/* Compile the vertex stage: build the VS key, run the shared front half
+ * (anv_pipeline_compile), compute the VUE map, invoke brw_compile_vs, and
+ * upload the kernel.  The offset lands in vs_simd8 or vs_vec4 depending on
+ * the dispatch mode the backend chose; the unused slot gets NO_KERNEL.
+ */
+static VkResult
+anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
+                        struct anv_pipeline_cache *cache,
+                        const VkGraphicsPipelineCreateInfo *info,
+                        struct anv_shader_module *module,
+                        const char *entrypoint,
+                        const VkSpecializationInfo *spec_info)
+{
+   const struct brw_compiler *compiler =
+      pipeline->device->instance->physicalDevice.compiler;
+   struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
+   struct brw_vs_prog_key key;
+
+   populate_vs_prog_key(&pipeline->device->info, &key);
+
+   /* TODO: Look up shader in cache */
+
+   memset(prog_data, 0, sizeof(*prog_data));
+
+   nir_shader *nir = anv_pipeline_compile(pipeline, module, entrypoint,
+                                          MESA_SHADER_VERTEX, spec_info,
+                                          &prog_data->base.base);
+   if (nir == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   /* Only steal NIR we created; meta NIR (module->nir) is owned elsewhere. */
+   if (module->nir == NULL)
+      ralloc_steal(mem_ctx, nir);
+
+   prog_data->inputs_read = nir->info.inputs_read;
+   if (nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ))
+      pipeline->writes_point_size = true;
+
+   brw_compute_vue_map(&pipeline->device->info,
+                       &prog_data->base.vue_map,
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
+
+   unsigned code_size;
+   const unsigned *shader_code =
+      brw_compile_vs(compiler, NULL, mem_ctx, &key, prog_data, nir,
+                     NULL, false, -1, &code_size, NULL);
+   if (shader_code == NULL) {
+      ralloc_free(mem_ctx);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   const uint32_t offset =
+      anv_pipeline_cache_upload_kernel(cache, shader_code, code_size);
+   if (prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
+      pipeline->vs_simd8 = offset;
+      pipeline->vs_vec4 = NO_KERNEL;
+   } else {
+      pipeline->vs_simd8 = NO_KERNEL;
+      pipeline->vs_vec4 = offset;
+   }
+
+   ralloc_free(mem_ctx);
+
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_VERTEX,
+                                   &prog_data->base.base);
+
+   return VK_SUCCESS;
+}
+
+/* Compile the geometry stage: build the GS key, run the shared front half,
+ * compute the VUE map, invoke brw_compile_gs, and upload the kernel.  Also
+ * records the input vertex count for later primitive setup.
+ */
+static VkResult
+anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
+                        struct anv_pipeline_cache *cache,
+                        const VkGraphicsPipelineCreateInfo *info,
+                        struct anv_shader_module *module,
+                        const char *entrypoint,
+                        const VkSpecializationInfo *spec_info)
+{
+   const struct brw_compiler *compiler =
+      pipeline->device->instance->physicalDevice.compiler;
+   struct brw_gs_prog_data *prog_data = &pipeline->gs_prog_data;
+   struct brw_gs_prog_key key;
+
+   populate_gs_prog_key(&pipeline->device->info, &key);
+
+   /* TODO: Look up shader in cache */
+
+   memset(prog_data, 0, sizeof(*prog_data));
+
+   nir_shader *nir = anv_pipeline_compile(pipeline, module, entrypoint,
+                                          MESA_SHADER_GEOMETRY, spec_info,
+                                          &prog_data->base.base);
+   if (nir == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   /* Only steal NIR we created; meta NIR (module->nir) is owned elsewhere. */
+   if (module->nir == NULL)
+      ralloc_steal(mem_ctx, nir);
+
+   if (nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ))
+      pipeline->writes_point_size = true;
+
+   brw_compute_vue_map(&pipeline->device->info,
+                       &prog_data->base.vue_map,
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
+
+   unsigned code_size;
+   const unsigned *shader_code =
+      brw_compile_gs(compiler, NULL, mem_ctx, &key, prog_data, nir,
+                     NULL, -1, &code_size, NULL);
+   if (shader_code == NULL) {
+      ralloc_free(mem_ctx);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   /* TODO: SIMD8 GS */
+   pipeline->gs_kernel =
+      anv_pipeline_cache_upload_kernel(cache, shader_code, code_size);
+   pipeline->gs_vertex_count = nir->info.gs.vertices_in;
+
+   ralloc_free(mem_ctx);
+
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_GEOMETRY,
+                                   &prog_data->base.base);
+
+   return VK_SUCCESS;
+}
+
+/* Compile the fragment-shader stage of a graphics pipeline.
+ *
+ * Lowers the module's entry point to NIR, runs the brw FS backend (which may
+ * produce SIMD8 and/or SIMD16 variants inside one kernel blob), uploads the
+ * blob to the pipeline cache and records the per-variant kernel start
+ * pointers and GRF start registers on @pipeline.
+ *
+ * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY on failure.
+ */
+static VkResult
+anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
+                        struct anv_pipeline_cache *cache,
+                        const VkGraphicsPipelineCreateInfo *info,
+                        const struct anv_graphics_pipeline_create_info *extra,
+                        struct anv_shader_module *module,
+                        const char *entrypoint,
+                        const VkSpecializationInfo *spec_info)
+{
+   const struct brw_compiler *compiler =
+      pipeline->device->instance->physicalDevice.compiler;
+   struct brw_wm_prog_data *prog_data = &pipeline->wm_prog_data;
+   struct brw_wm_prog_key key;
+
+   populate_wm_prog_key(&pipeline->device->info, info, extra, &key);
+
+   /* Replicated-data (meta) clears write a single color target. */
+   if (pipeline->use_repclear)
+      key.nr_color_regions = 1;
+
+   /* TODO: Look up shader in cache */
+
+   memset(prog_data, 0, sizeof(*prog_data));
+
+   prog_data->binding_table.render_target_start = 0;
+
+   nir_shader *nir = anv_pipeline_compile(pipeline, module, entrypoint,
+                                          MESA_SHADER_FRAGMENT, spec_info,
+                                          &prog_data->base);
+   if (nir == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   /* If the NIR was generated here (the module carries no pre-built NIR),
+    * take ownership so it is freed together with mem_ctx below.
+    */
+   if (module->nir == NULL)
+      ralloc_steal(mem_ctx, nir);
+
+   unsigned code_size;
+   const unsigned *shader_code =
+      brw_compile_fs(compiler, NULL, mem_ctx, &key, prog_data, nir,
+                     NULL, -1, -1, pipeline->use_repclear, &code_size, NULL);
+   if (shader_code == NULL) {
+      ralloc_free(mem_ctx);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   uint32_t offset =
+      anv_pipeline_cache_upload_kernel(cache, shader_code, code_size);
+   if (prog_data->no_8)
+      pipeline->ps_simd8 = NO_KERNEL;
+   else
+      pipeline->ps_simd8 = offset;
+
+   /* A SIMD16 variant exists when SIMD8 was skipped entirely (no_8) or when
+    * the backend emitted SIMD16 code at prog_offset_16 within the blob.
+    */
+   if (prog_data->no_8 || prog_data->prog_offset_16) {
+      pipeline->ps_simd16 = offset + prog_data->prog_offset_16;
+   } else {
+      pipeline->ps_simd16 = NO_KERNEL;
+   }
+
+   /* Pick the kernel start pointers / dispatch GRF starts for 3DSTATE_PS:
+    * slot 0 gets SIMD8 when present (with SIMD16 in slot 2), otherwise
+    * SIMD16 goes in slot 0.
+    */
+   pipeline->ps_ksp2 = 0;
+   pipeline->ps_grf_start2 = 0;
+   if (pipeline->ps_simd8 != NO_KERNEL) {
+      pipeline->ps_ksp0 = pipeline->ps_simd8;
+      pipeline->ps_grf_start0 = prog_data->base.dispatch_grf_start_reg;
+      if (pipeline->ps_simd16 != NO_KERNEL) {
+         pipeline->ps_ksp2 = pipeline->ps_simd16;
+         pipeline->ps_grf_start2 = prog_data->dispatch_grf_start_reg_16;
+      }
+   } else if (pipeline->ps_simd16 != NO_KERNEL) {
+      pipeline->ps_ksp0 = pipeline->ps_simd16;
+      pipeline->ps_grf_start0 = prog_data->dispatch_grf_start_reg_16;
+   }
+
+   ralloc_free(mem_ctx);
+
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT,
+                                   &prog_data->base);
+
+   return VK_SUCCESS;
+}
+
+/* Compile the compute-shader stage of a compute pipeline.
+ *
+ * Lowers the module's entry point to NIR, runs the brw CS backend, uploads
+ * the kernel to the pipeline cache and records the kernel offset on
+ * @pipeline.
+ *
+ * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY on failure.
+ */
+VkResult
+anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
+                        struct anv_pipeline_cache *cache,
+                        const VkComputePipelineCreateInfo *info,
+                        struct anv_shader_module *module,
+                        const char *entrypoint,
+                        const VkSpecializationInfo *spec_info)
+{
+   const struct brw_compiler *compiler =
+      pipeline->device->instance->physicalDevice.compiler;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+   struct brw_cs_prog_key key;
+
+   populate_cs_prog_key(&pipeline->device->info, &key);
+
+   /* TODO: Look up shader in cache */
+
+   memset(prog_data, 0, sizeof(*prog_data));
+
+   prog_data->binding_table.work_groups_start = 0;
+
+   nir_shader *nir = anv_pipeline_compile(pipeline, module, entrypoint,
+                                          MESA_SHADER_COMPUTE, spec_info,
+                                          &prog_data->base);
+   if (nir == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Amount of shared-local-memory the shader declares, from NIR. */
+   prog_data->base.total_shared = nir->num_shared;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   /* If the NIR was generated here (the module carries no pre-built NIR),
+    * take ownership so it is freed together with mem_ctx below.
+    */
+   if (module->nir == NULL)
+      ralloc_steal(mem_ctx, nir);
+
+   unsigned code_size;
+   const unsigned *shader_code =
+      brw_compile_cs(compiler, NULL, mem_ctx, &key, prog_data, nir,
+                     -1, &code_size, NULL);
+   if (shader_code == NULL) {
+      ralloc_free(mem_ctx);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
+   pipeline->cs_simd =
+      anv_pipeline_cache_upload_kernel(cache, shader_code, code_size);
+   ralloc_free(mem_ctx);
+
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_COMPUTE,
+                                   &prog_data->base);
+
+   return VK_SUCCESS;
+}
+
+/* URB space reserved for push constants, in bytes.  NOTE(review): this gen8
+ * value is used unconditionally by gen7_compute_urb_partition() below —
+ * confirm that is intentional for gen7/gen9 as well.
+ */
+static const int gen8_push_size = 32 * 1024;
+
+/* Partition the URB between push constants, the VS and the GS.
+ *
+ * Each stage first receives the minimum space it needs; any remaining URB
+ * chunks are then handed out in proportion to how much extra space each
+ * stage could actually use ("wants").  The resulting layout — start offsets
+ * in 8k chunks, entry sizes in 64-byte units, and entry counts — is stored
+ * in pipeline->urb for later 3DSTATE_URB_* programming.
+ */
+static void
+gen7_compute_urb_partition(struct anv_pipeline *pipeline)
+{
+   const struct brw_device_info *devinfo = &pipeline->device->info;
+   bool vs_present = pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT;
+   unsigned vs_size = vs_present ? pipeline->vs_prog_data.base.urb_entry_size : 1;
+   unsigned vs_entry_size_bytes = vs_size * 64;
+   bool gs_present = pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT;
+   unsigned gs_size = gs_present ? pipeline->gs_prog_data.base.urb_entry_size : 1;
+   unsigned gs_entry_size_bytes = gs_size * 64;
+
+   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
+    *
+    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
+    *     Allocation Size is less than 9 512-bit URB entries.
+    *
+    * Similar text exists for GS.
+    */
+   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
+
+   /* URB allocations must be done in 8k chunks. */
+   unsigned chunk_size_bytes = 8192;
+
+   /* Determine the size of the URB in chunks. */
+   unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;
+
+   /* Reserve space for push constants */
+   unsigned push_constant_bytes = gen8_push_size;
+   unsigned push_constant_chunks =
+      push_constant_bytes / chunk_size_bytes;
+
+   /* Initially, assign each stage the minimum amount of URB space it needs,
+    * and make a note of how much additional space it "wants" (the amount of
+    * additional space it could actually make use of).
+    */
+
+   /* VS has a lower limit on the number of URB entries */
+   unsigned vs_chunks =
+      ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
+            chunk_size_bytes) / chunk_size_bytes;
+   unsigned vs_wants =
+      ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
+            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+
+   unsigned gs_chunks = 0;
+   unsigned gs_wants = 0;
+   if (gs_present) {
+      /* There are two constraints on the minimum amount of URB space we can
+       * allocate:
+       *
+       * (1) We need room for at least 2 URB entries, since we always operate
+       * the GS in DUAL_OBJECT mode.
+       *
+       * (2) We can't allocate less than nr_gs_entries_granularity.
+       */
+      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
+                        chunk_size_bytes) / chunk_size_bytes;
+      gs_wants =
+         ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
+               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+   }
+
+   /* There should always be enough URB space to satisfy the minimum
+    * requirements of each stage.
+    */
+   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   assert(total_needs <= urb_chunks);
+
+   /* Mete out remaining space (if any) in proportion to "wants". */
+   unsigned total_wants = vs_wants + gs_wants;
+   unsigned remaining_space = urb_chunks - total_needs;
+   if (remaining_space > total_wants)
+      remaining_space = total_wants;
+   if (remaining_space > 0) {
+      unsigned vs_additional = (unsigned)
+         round(vs_wants * (((double) remaining_space) / total_wants));
+      vs_chunks += vs_additional;
+      remaining_space -= vs_additional;
+      gs_chunks += remaining_space;
+   }
+
+   /* Sanity check that we haven't over-allocated. */
+   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+
+   /* Finally, compute the number of entries that can fit in the space
+    * allocated to each stage.
+    */
+   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
+
+   /* Since we rounded up when computing *_wants, this may be slightly more
+    * than the maximum allowed amount, so correct for that.
+    */
+   nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
+   nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);
+
+   /* Ensure that we program a multiple of the granularity. */
+   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
+
+   /* Finally, sanity check to make sure we have at least the minimum number
+    * of entries needed for each stage.
+    */
+   assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
+   if (gs_present)
+      assert(nr_gs_entries >= 2);
+
+   /* Lay out the URB in the following order:
+    * - push constants
+    * - VS
+    * - GS
+    */
+   pipeline->urb.vs_start = push_constant_chunks;
+   pipeline->urb.vs_size = vs_size;
+   pipeline->urb.nr_vs_entries = nr_vs_entries;
+
+   pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
+   pipeline->urb.gs_size = gs_size;
+   pipeline->urb.nr_gs_entries = nr_gs_entries;
+}
+
+/* Initialize pipeline->dynamic_state from the create info.
+ *
+ * Starts from default_dynamic_state, then bakes in the value of every piece
+ * of state the application did NOT list in pDynamicState; state marked
+ * dynamic is left at its default and supplied at command-buffer time.  The
+ * mask of statically-baked states is recorded in
+ * pipeline->dynamic_state_mask.
+ */
+static void
+anv_pipeline_init_dynamic_state(struct anv_pipeline *pipeline,
+                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
+{
+   anv_cmd_dirty_mask_t states = ANV_CMD_DIRTY_DYNAMIC_ALL;
+   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
+   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
+
+   pipeline->dynamic_state = default_dynamic_state;
+
+   if (pCreateInfo->pDynamicState) {
+      /* Remove all of the states that are marked as dynamic */
+      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
+      for (uint32_t s = 0; s < count; s++)
+         states &= ~(1 << pCreateInfo->pDynamicState->pDynamicStates[s]);
+   }
+
+   struct anv_dynamic_state *dynamic = &pipeline->dynamic_state;
+
+   /* Counts are always taken from the create info; the array contents are
+    * only copied when the corresponding state is static.
+    */
+   dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
+   if (states & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
+      typed_memcpy(dynamic->viewport.viewports,
+                   pCreateInfo->pViewportState->pViewports,
+                   pCreateInfo->pViewportState->viewportCount);
+   }
+
+   dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
+   if (states & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
+      typed_memcpy(dynamic->scissor.scissors,
+                   pCreateInfo->pViewportState->pScissors,
+                   pCreateInfo->pViewportState->scissorCount);
+   }
+
+   if (states & (1 << VK_DYNAMIC_STATE_LINE_WIDTH)) {
+      assert(pCreateInfo->pRasterizationState);
+      dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
+   }
+
+   if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BIAS)) {
+      assert(pCreateInfo->pRasterizationState);
+      dynamic->depth_bias.bias =
+         pCreateInfo->pRasterizationState->depthBiasConstantFactor;
+      dynamic->depth_bias.clamp =
+         pCreateInfo->pRasterizationState->depthBiasClamp;
+      dynamic->depth_bias.slope =
+         pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
+   }
+
+   if (states & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS)) {
+      assert(pCreateInfo->pColorBlendState);
+      typed_memcpy(dynamic->blend_constants,
+                   pCreateInfo->pColorBlendState->blendConstants, 4);
+   }
+
+   /* If there is no depthstencil attachment, then don't read
+    * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
+    * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
+    * no need to override the depthstencil defaults in
+    * anv_pipeline::dynamic_state when there is no depthstencil attachment.
+    *
+    * From the Vulkan spec (20 Oct 2015, git-aa308cb):
+    *
+    *    pDepthStencilState [...] may only be NULL if renderPass and subpass
+    *    specify a subpass that has no depth/stencil attachment.
+    */
+   if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
+      if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS)) {
+         assert(pCreateInfo->pDepthStencilState);
+         dynamic->depth_bounds.min =
+            pCreateInfo->pDepthStencilState->minDepthBounds;
+         dynamic->depth_bounds.max =
+            pCreateInfo->pDepthStencilState->maxDepthBounds;
+      }
+
+      if (states & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK)) {
+         assert(pCreateInfo->pDepthStencilState);
+         dynamic->stencil_compare_mask.front =
+            pCreateInfo->pDepthStencilState->front.compareMask;
+         dynamic->stencil_compare_mask.back =
+            pCreateInfo->pDepthStencilState->back.compareMask;
+      }
+
+      if (states & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) {
+         assert(pCreateInfo->pDepthStencilState);
+         dynamic->stencil_write_mask.front =
+            pCreateInfo->pDepthStencilState->front.writeMask;
+         dynamic->stencil_write_mask.back =
+            pCreateInfo->pDepthStencilState->back.writeMask;
+      }
+
+      if (states & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE)) {
+         assert(pCreateInfo->pDepthStencilState);
+         dynamic->stencil_reference.front =
+            pCreateInfo->pDepthStencilState->front.reference;
+         dynamic->stencil_reference.back =
+            pCreateInfo->pDepthStencilState->back.reference;
+      }
+   }
+
+   pipeline->dynamic_state_mask = states;
+}
+
+/* Debug-build sanity checks on a VkGraphicsPipelineCreateInfo.
+ *
+ * Asserts that the members the spec requires are present.  Meta pipelines
+ * use anv_meta_dummy_renderpass and skip the subpass-dependent checks.
+ * Called only inside an anv_validate block, so this compiles away in
+ * release builds.
+ */
+static void
+anv_pipeline_validate_create_info(const VkGraphicsPipelineCreateInfo *info)
+{
+   struct anv_render_pass *renderpass = NULL;
+   struct anv_subpass *subpass = NULL;
+
+   /* Assert that all required members of VkGraphicsPipelineCreateInfo are
+    * present, as explained by the Vulkan (20 Oct 2015, git-aa308cb), Section
+    * 4.2 Graphics Pipeline.
+    */
+   assert(info->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
+
+   renderpass = anv_render_pass_from_handle(info->renderPass);
+   assert(renderpass);
+
+   if (renderpass != &anv_meta_dummy_renderpass) {
+      assert(info->subpass < renderpass->subpass_count);
+      subpass = &renderpass->subpasses[info->subpass];
+   }
+
+   assert(info->stageCount >= 1);
+   assert(info->pVertexInputState);
+   assert(info->pInputAssemblyState);
+   assert(info->pViewportState);
+   assert(info->pRasterizationState);
+
+   if (subpass && subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED)
+      assert(info->pDepthStencilState);
+
+   if (subpass && subpass->color_count > 0)
+      assert(info->pColorBlendState);
+
+   /* Tessellation state is only required when a tessellation stage is used. */
+   for (uint32_t i = 0; i < info->stageCount; ++i) {
+      switch (info->pStages[i].stage) {
+      case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
+      case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
+         assert(info->pTessellationState);
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+/* Top-level, gen-independent initialization of a graphics pipeline.
+ *
+ * Sets up the pipeline's batch/relocation buffers and dynamic state,
+ * compiles every shader stage listed in @pCreateInfo, partitions the URB,
+ * and captures the vertex-input and input-assembly configuration.  @extra
+ * carries meta-driver overrides (use_repclear, disable_vs, use_rectlist)
+ * and may be NULL for normal pipelines.
+ */
+VkResult
+anv_pipeline_init(struct anv_pipeline *pipeline,
+                  struct anv_device *device,
+                  struct anv_pipeline_cache *cache,
+                  const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                  const struct anv_graphics_pipeline_create_info *extra,
+                  const VkAllocationCallbacks *alloc)
+{
+   VkResult result;
+
+   /* Validation only runs in debug builds. */
+   anv_validate {
+      anv_pipeline_validate_create_info(pCreateInfo);
+   }
+
+   if (alloc == NULL)
+      alloc = &device->alloc;
+
+   pipeline->device = device;
+   pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
+
+   result = anv_reloc_list_init(&pipeline->batch_relocs, alloc);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* The pipeline's batch commands are recorded into the fixed-size
+    * batch_data buffer embedded in the pipeline object.
+    */
+   pipeline->batch.alloc = alloc;
+   pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
+   pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
+   pipeline->batch.relocs = &pipeline->batch_relocs;
+
+   anv_pipeline_init_dynamic_state(pipeline, pCreateInfo);
+
+   if (pCreateInfo->pTessellationState)
+      anv_finishme("VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO");
+   if (pCreateInfo->pMultisampleState &&
+       pCreateInfo->pMultisampleState->rasterizationSamples > 1)
+      anv_finishme("VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO");
+
+   pipeline->use_repclear = extra && extra->use_repclear;
+   pipeline->writes_point_size = false;
+
+   /* When we free the pipeline, we detect stages based on the NULL status
+    * of various prog_data pointers.  Make them NULL by default.
+    */
+   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
+   memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));
+
+   pipeline->vs_simd8 = NO_KERNEL;
+   pipeline->vs_vec4 = NO_KERNEL;
+   pipeline->gs_kernel = NO_KERNEL;
+
+   pipeline->active_stages = 0;
+   pipeline->total_scratch = 0;
+
+   /* Compile each provided stage; unsupported stages (e.g. tessellation)
+    * only emit a FINISHME.  Note the per-stage compile results are not
+    * checked here.
+    */
+   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
+      ANV_FROM_HANDLE(anv_shader_module, module,
+                      pCreateInfo->pStages[i].module);
+
+      switch (pCreateInfo->pStages[i].stage) {
+      case VK_SHADER_STAGE_VERTEX_BIT:
+         anv_pipeline_compile_vs(pipeline, cache, pCreateInfo, module,
+                                 pCreateInfo->pStages[i].pName,
+                                 pCreateInfo->pStages[i].pSpecializationInfo);
+         break;
+      case VK_SHADER_STAGE_GEOMETRY_BIT:
+         anv_pipeline_compile_gs(pipeline, cache, pCreateInfo, module,
+                                 pCreateInfo->pStages[i].pName,
+                                 pCreateInfo->pStages[i].pSpecializationInfo);
+         break;
+      case VK_SHADER_STAGE_FRAGMENT_BIT:
+         anv_pipeline_compile_fs(pipeline, cache, pCreateInfo, extra, module,
+                                 pCreateInfo->pStages[i].pName,
+                                 pCreateInfo->pStages[i].pSpecializationInfo);
+         break;
+      default:
+         anv_finishme("Unsupported shader stage");
+      }
+   }
+
+   if (!(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT)) {
+      /* Vertex is only optional if disable_vs is set */
+      /* NOTE(review): extra is dereferenced without a NULL check here; a
+       * missing vertex stage is assumed to come only from meta pipelines
+       * that pass extra->disable_vs — confirm.
+       */
+      assert(extra->disable_vs);
+      memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
+   }
+
+   gen7_compute_urb_partition(pipeline);
+
+   const VkPipelineVertexInputStateCreateInfo *vi_info =
+      pCreateInfo->pVertexInputState;
+
+   uint64_t inputs_read;
+   if (extra && extra->disable_vs) {
+      /* If the VS is disabled, just assume the user knows what they're
+       * doing and apply the layout blindly.  This can only come from
+       * meta, so this *should* be safe.
+       */
+      inputs_read = ~0ull;
+   } else {
+      inputs_read = pipeline->vs_prog_data.inputs_read;
+   }
+
+   /* Record which vertex buffer bindings are actually consumed by the VS. */
+   pipeline->vb_used = 0;
+   for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
+      const VkVertexInputAttributeDescription *desc =
+         &vi_info->pVertexAttributeDescriptions[i];
+
+      if (inputs_read & (1 << (VERT_ATTRIB_GENERIC0 + desc->location)))
+         pipeline->vb_used |= 1 << desc->binding;
+   }
+
+   for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
+      const VkVertexInputBindingDescription *desc =
+         &vi_info->pVertexBindingDescriptions[i];
+
+      pipeline->binding_stride[desc->binding] = desc->stride;
+
+      /* Step rate is programmed per vertex element (attribute), not
+       * binding. Set up a map of which bindings step per instance, for
+       * reference by vertex element setup. */
+      switch (desc->inputRate) {
+      default:
+      case VK_VERTEX_INPUT_RATE_VERTEX:
+         pipeline->instancing_enable[desc->binding] = false;
+         break;
+      case VK_VERTEX_INPUT_RATE_INSTANCE:
+         pipeline->instancing_enable[desc->binding] = true;
+         break;
+      }
+   }
+
+   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
+      pCreateInfo->pInputAssemblyState;
+   pipeline->primitive_restart = ia_info->primitiveRestartEnable;
+   pipeline->topology = vk_to_gen_primitive_type[ia_info->topology];
+
+   /* Meta operations (clears/blits) render with rectangle-list primitives. */
+   if (extra && extra->use_rectlist)
+      pipeline->topology = _3DPRIM_RECTLIST;
+
+   return VK_SUCCESS;
+}
+
+/* Create a graphics pipeline, dispatching to the gen-specific
+ * implementation based on the device's hardware generation.  Falls back to
+ * the device's default pipeline cache when no cache handle is supplied.
+ */
+VkResult
+anv_graphics_pipeline_create(
+   VkDevice _device,
+   VkPipelineCache _cache,
+   const VkGraphicsPipelineCreateInfo *pCreateInfo,
+   const struct anv_graphics_pipeline_create_info *extra,
+   const VkAllocationCallbacks *pAllocator,
+   VkPipeline *pPipeline)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
+
+   if (cache == NULL)
+      cache = &device->default_pipeline_cache;
+
+   switch (device->info.gen) {
+   case 7:
+      if (device->info.is_haswell)
+         return gen75_graphics_pipeline_create(_device, cache, pCreateInfo, extra, pAllocator, pPipeline);
+      else
+         return gen7_graphics_pipeline_create(_device, cache, pCreateInfo, extra, pAllocator, pPipeline);
+   case 8:
+      return gen8_graphics_pipeline_create(_device, cache, pCreateInfo, extra, pAllocator, pPipeline);
+   case 9:
+      return gen9_graphics_pipeline_create(_device, cache, pCreateInfo, extra, pAllocator, pPipeline);
+   default:
+      unreachable("unsupported gen\n");
+   }
+}
+
+/* vkCreateGraphicsPipelines: create @count pipelines in order.  On the
+ * first failure, every pipeline created so far is destroyed and the failing
+ * result is returned.  NOTE(review): pPipelines entries at and past the
+ * failing index are left unwritten — confirm callers don't read them.
+ */
+VkResult anv_CreateGraphicsPipelines(
+    VkDevice                                    _device,
+    VkPipelineCache                             pipelineCache,
+    uint32_t                                    count,
+    const VkGraphicsPipelineCreateInfo*         pCreateInfos,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipelines)
+{
+   VkResult result = VK_SUCCESS;
+
+   unsigned i = 0;
+   for (; i < count; i++) {
+      result = anv_graphics_pipeline_create(_device,
+                                            pipelineCache,
+                                            &pCreateInfos[i],
+                                            NULL, pAllocator, &pPipelines[i]);
+      if (result != VK_SUCCESS) {
+         /* Roll back: destroy the pipelines that already succeeded. */
+         for (unsigned j = 0; j < i; j++) {
+            anv_DestroyPipeline(_device, pPipelines[j], pAllocator);
+         }
+
+         return result;
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+/* Create a compute pipeline, dispatching to the gen-specific implementation
+ * based on the device's hardware generation.  Falls back to the device's
+ * default pipeline cache when no cache handle is supplied.
+ */
+static VkResult anv_compute_pipeline_create(
+    VkDevice                                    _device,
+    VkPipelineCache                             _cache,
+    const VkComputePipelineCreateInfo*          pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipeline)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
+
+   if (cache == NULL)
+      cache = &device->default_pipeline_cache;
+
+   switch (device->info.gen) {
+   case 7:
+      if (device->info.is_haswell)
+         return gen75_compute_pipeline_create(_device, cache, pCreateInfo, pAllocator, pPipeline);
+      else
+         return gen7_compute_pipeline_create(_device, cache, pCreateInfo, pAllocator, pPipeline);
+   case 8:
+      return gen8_compute_pipeline_create(_device, cache, pCreateInfo, pAllocator, pPipeline);
+   case 9:
+      return gen9_compute_pipeline_create(_device, cache, pCreateInfo, pAllocator, pPipeline);
+   default:
+      unreachable("unsupported gen\n");
+   }
+}
+
+/* vkCreateComputePipelines: create @count pipelines in order.  On the first
+ * failure, every pipeline created so far is destroyed and the failing
+ * result is returned.  NOTE(review): pPipelines entries at and past the
+ * failing index are left unwritten — confirm callers don't read them.
+ */
+VkResult anv_CreateComputePipelines(
+    VkDevice                                    _device,
+    VkPipelineCache                             pipelineCache,
+    uint32_t                                    count,
+    const VkComputePipelineCreateInfo*          pCreateInfos,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipelines)
+{
+   VkResult result = VK_SUCCESS;
+
+   unsigned i = 0;
+   for (; i < count; i++) {
+      result = anv_compute_pipeline_create(_device, pipelineCache,
+                                           &pCreateInfos[i],
+                                           pAllocator, &pPipelines[i]);
+      if (result != VK_SUCCESS) {
+         /* Roll back: destroy the pipelines that already succeeded. */
+         for (unsigned j = 0; j < i; j++) {
+            anv_DestroyPipeline(_device, pPipelines[j], pAllocator);
+         }
+
+         return result;
+      }
+   }
+
+   return VK_SUCCESS;
+}
diff --git a/src/vulkan/anv_private.h b/src/vulkan/anv_private.h
new file mode 100644 (file)
index 0000000..05a6342
--- /dev/null
@@ -0,0 +1,1796 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <assert.h>
+#include <stdint.h>
+#include <i915_drm.h>
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#else
+#define VG(x)
+#endif
+
+#include "brw_device_info.h"
+#include "util/macros.h"
+#include "util/list.h"
+
+/* Pre-declarations needed for WSI entrypoints */
+struct wl_surface;
+struct wl_display;
+typedef struct xcb_connection_t xcb_connection_t;
+typedef uint32_t xcb_visualid_t;
+typedef uint32_t xcb_window_t;
+
+#define VK_USE_PLATFORM_XCB_KHR
+#define VK_USE_PLATFORM_WAYLAND_KHR
+
+#define VK_PROTOTYPES
+#include <vulkan/vulkan.h>
+#include <vulkan/vulkan_intel.h>
+
+#include "anv_entrypoints.h"
+#include "anv_gen_macros.h"
+#include "brw_context.h"
+#include "isl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VBS         32
+#define MAX_SETS         8
+#define MAX_RTS          8
+#define MAX_VIEWPORTS   16
+#define MAX_SCISSORS    16
+#define MAX_PUSH_CONSTANTS_SIZE 128
+#define MAX_DYNAMIC_BUFFERS 16
+#define MAX_IMAGES 8
+
+#define ICD_LOADER_MAGIC   0x01CDC0DE
+
+typedef union _VK_LOADER_DATA {
+  uintptr_t loaderMagic;
+  void *loaderData;
+} VK_LOADER_DATA;
+
+#define anv_noreturn __attribute__((__noreturn__))
+#define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/** Round @v up to the next multiple of @a; @a must be a non-zero power of
+ * two (enforced by the assert).
+ */
+static inline uint32_t
+align_u32(uint32_t v, uint32_t a)
+{
+   assert(a != 0 && a == (a & -a));
+   return (v + a - 1) & ~(a - 1);
+}
+
+/** 64-bit variant of align_u32(). */
+static inline uint64_t
+align_u64(uint64_t v, uint64_t a)
+{
+   assert(a != 0 && a == (a & -a));
+   return (v + a - 1) & ~(a - 1);
+}
+
+/** Signed 32-bit variant of align_u32(). */
+static inline int32_t
+align_i32(int32_t v, int32_t a)
+{
+   assert(a != 0 && a == (a & -a));
+   return (v + a - 1) & ~(a - 1);
+}
+
+/** Alignment must be a power of 2. */
+static inline bool
+anv_is_aligned(uintmax_t n, uintmax_t a)
+{
+   assert(a == (a & -a));
+   return (n & (a - 1)) == 0;
+}
+
+/** Minify dimension @n by @levels mip levels: shift right by @levels,
+ * clamping the result to at least 1.  A zero input stays zero.
+ */
+static inline uint32_t
+anv_minify(uint32_t n, uint32_t levels)
+{
+   if (unlikely(n == 0))
+      return 0;
+   else
+      return MAX(n >> levels, 1);
+}
+
+/** Clamp @f into [@min, @max]; @min must be strictly less than @max. */
+static inline float
+anv_clamp_f(float f, float min, float max)
+{
+   assert(min < max);
+
+   if (f > max)
+      return max;
+   else if (f < min)
+      return min;
+   else
+      return f;
+}
+
+/** Clear @clear_mask bits from *inout_mask.  Returns true if any of those
+ * bits were previously set.
+ */
+static inline bool
+anv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
+{
+   if (*inout_mask & clear_mask) {
+      *inout_mask &= ~clear_mask;
+      return true;
+   } else {
+      return false;
+   }
+}
+
+#define for_each_bit(b, dword)                          \
+   for (uint32_t __dword = (dword);                     \
+        (b) = __builtin_ffs(__dword) - 1, __dword;      \
+        __dword &= ~(1 << (b)))
+
+#define typed_memcpy(dest, src, count) ({ \
+   static_assert(sizeof(*src) == sizeof(*dest), ""); \
+   memcpy((dest), (src), (count) * sizeof(*(src))); \
+})
+
+#define zero(x) (memset(&(x), 0, sizeof(x)))
+
+/* Define no kernel as 1, since that's an illegal offset for a kernel */
+#define NO_KERNEL 1
+
+struct anv_common {
+    VkStructureType                             sType;
+    const void*                                 pNext;
+};
+
+/* Whenever we generate an error, pass it through this function. Useful for
+ * debugging, where we can break on it. Only call at error site, not when
+ * propagating errors. Might be useful to plug in a stack trace here.
+ */
+
+VkResult __vk_errorf(VkResult error, const char *file, int line, const char *format, ...);
+
+#ifdef DEBUG
+#define vk_error(error) __vk_errorf(error, __FILE__, __LINE__, NULL);
+#define vk_errorf(error, format, ...) __vk_errorf(error, __FILE__, __LINE__, format, ## __VA_ARGS__);
+#else
+#define vk_error(error) error
+#define vk_errorf(error, format, ...) error
+#endif
+
+void __anv_finishme(const char *file, int line, const char *format, ...)
+   anv_printflike(3, 4);
+void anv_loge(const char *format, ...) anv_printflike(1, 2);
+void anv_loge_v(const char *format, va_list va);
+
+/**
+ * Print a FINISHME message, including its source location.
+ */
+#define anv_finishme(format, ...) \
+   __anv_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__);
+
/* A non-fatal assert.  Useful for debugging. */
#ifdef DEBUG
#define anv_assert(x) ({ \
   if (unlikely(!(x))) \
      fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \
})
#else
#define anv_assert(x)
#endif

/**
 * If a block of code is annotated with anv_validate, then the block runs only
 * in debug builds.
 */
#ifdef DEBUG
#define anv_validate if (1)
#else
#define anv_validate if (0)
#endif

/* Print a formatted message and terminate the process (never returns). */
void anv_abortf(const char *format, ...) anv_noreturn anv_printflike(1, 2);
void anv_abortfv(const char *format, va_list va) anv_noreturn;

/* Log an unimplemented-entrypoint FINISHME and return `v` from the caller. */
#define stub_return(v) \
   do { \
      anv_finishme("stub %s", __func__); \
      return (v); \
   } while (0)

/* Same as stub_return, for void functions. */
#define stub() \
   do { \
      anv_finishme("stub %s", __func__); \
      return; \
   } while (0)
+
+/**
+ * A dynamically growable, circular buffer.  Elements are added at head and
+ * removed from tail. head and tail are free-running uint32_t indices and we
+ * only compute the modulo with size when accessing the array.  This way,
+ * number of bytes in the queue is always head - tail, even in case of
+ * wraparound.
+ */
+
+struct anv_vector {
+   uint32_t head;
+   uint32_t tail;
+   uint32_t element_size;
+   uint32_t size;
+   void *data;
+};
+
+int anv_vector_init(struct anv_vector *queue, uint32_t element_size, uint32_t size);
+void *anv_vector_add(struct anv_vector *queue);
+void *anv_vector_remove(struct anv_vector *queue);
+
+static inline int
+anv_vector_length(struct anv_vector *queue)
+{
+   return (queue->head - queue->tail) / queue->element_size;
+}
+
+static inline void *
+anv_vector_head(struct anv_vector *vector)
+{
+   assert(vector->tail < vector->head);
+   return (void *)((char *)vector->data +
+                   ((vector->head - vector->element_size) &
+                    (vector->size - 1)));
+}
+
+static inline void *
+anv_vector_tail(struct anv_vector *vector)
+{
+   return (void *)((char *)vector->data + (vector->tail & (vector->size - 1)));
+}
+
/* Release the vector's backing storage.  The struct itself is not freed. */
static inline void
anv_vector_finish(struct anv_vector *queue)
{
   free(queue->data);
}

/* Iterate `elem` over every element between tail and head.
 * NOTE: `elem` is assigned before the bounds check, so it is only valid
 * inside the loop body; after the loop it points one past the last element.
 */
#define anv_vector_foreach(elem, queue)                                  \
   static_assert(__builtin_types_compatible_p(__typeof__(queue), struct anv_vector *), ""); \
   for (uint32_t __anv_vector_offset = (queue)->tail;                                \
        elem = (queue)->data + (__anv_vector_offset & ((queue)->size - 1)), __anv_vector_offset < (queue)->head; \
        __anv_vector_offset += (queue)->element_size)
+
/* A GEM buffer object plus the driver-side bookkeeping needed for execbuf. */
struct anv_bo {
   uint32_t gem_handle;

   /* Index into the current validation list.  This is used by the
    * validation list building algorithm to track which buffers are already
    * in the validation list so that we can ensure uniqueness.
    */
   uint32_t index;

   /* Last known offset.  This value is provided by the kernel when we
    * execbuf and is used as the presumed offset for the next bunch of
    * relocations.
    */
   uint64_t offset;

   uint64_t size;
   void *map;   /* CPU mapping, or NULL if not mapped */
};

/* Represents a lock-free linked list of "free" things.  This is used by
 * both the block pool and the state pools.  Unfortunately, in order to
 * solve the ABA problem, we can't use a single uint32_t head.
 */
union anv_free_list {
   struct {
      int32_t offset;

      /* A simple count that is incremented every time the head changes. */
      uint32_t count;
   };
   uint64_t u64;   /* both fields viewed as one word for 64-bit atomic updates */
};

#define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { 1, 0 } })
+
/* Allocation cursor for a block pool: `next` is the next free offset and
 * `end` the end of the backing storage.  The pair is overlaid on a u64 so
 * both can be read/updated in a single 64-bit operation — presumably via
 * atomics; confirm in anv_allocator.c.
 */
struct anv_block_state {
   union {
      struct {
         uint32_t next;
         uint32_t end;
      };
      uint64_t u64;
   };
};

struct anv_block_pool {
   struct anv_device *device;

   struct anv_bo bo;

   /* The offset from the start of the bo to the "center" of the block
    * pool.  Pointers to allocated blocks are given by
    * bo.map + center_bo_offset + offsets.
    */
   uint32_t center_bo_offset;

   /* Current memory map of the block pool.  This pointer may or may not
    * point to the actual beginning of the block pool memory.  If
    * anv_block_pool_alloc_back has ever been called, then this pointer
    * will point to the "center" position of the buffer and all offsets
    * (negative or positive) given out by the block pool alloc functions
    * will be valid relative to this pointer.
    *
    * In particular, map == bo.map + center_offset
    */
   void *map;
   int fd;   /* memfd backing the pool */

   /**
    * Array of mmaps and gem handles owned by the block pool, reclaimed when
    * the block pool is destroyed.
    */
   struct anv_vector mmap_cleanups;

   uint32_t block_size;

   /* Forward (positive-offset) allocation state and its free list */
   union anv_free_list free_list;
   struct anv_block_state state;

   /* Backward (negative-offset) allocation state and its free list */
   union anv_free_list back_free_list;
   struct anv_block_state back_state;
};
+
/* Block pools are backed by a fixed-size 4GB memfd (1ull << 32 bytes) */
#define BLOCK_POOL_MEMFD_SIZE (1ull << 32)

/* The center of the block pool is also the middle of the memfd.  This may
 * change in the future if we decide differently for some reason.
 */
#define BLOCK_POOL_MEMFD_CENTER (BLOCK_POOL_MEMFD_SIZE / 2)
+
+static inline uint32_t
+anv_block_pool_size(struct anv_block_pool *pool)
+{
+   return pool->state.end + pool->back_state.end;
+}
+
/* A piece of GPU-visible state: its offset within the backing pool/bo and
 * its CPU mapping.
 */
struct anv_state {
   int32_t offset;        /* may be negative for back-of-pool allocations */
   uint32_t alloc_size;
   void *map;
};

/* One power-of-two size class of the state pool. */
struct anv_fixed_size_state_pool {
   size_t state_size;
   union anv_free_list free_list;
   struct anv_block_state block;
};

#define ANV_MIN_STATE_SIZE_LOG2 6
#define ANV_MAX_STATE_SIZE_LOG2 10

#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2)

/* State allocator with one bucket per power-of-two size class, all carved
 * out of a single block pool.
 */
struct anv_state_pool {
   struct anv_block_pool *block_pool;
   struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS];
};

struct anv_state_stream_block;

/* Linear ("bump") allocator that carves states out of successive blocks. */
struct anv_state_stream {
   struct anv_block_pool *block_pool;

   /* The current working block */
   struct anv_state_stream_block *block;

   /* Offset at which the current block starts */
   uint32_t start;
   /* Offset at which to allocate the next state */
   uint32_t next;
   /* Offset at which the current block ends */
   uint32_t end;
};
+
/* CPU cacheline geometry used by the clflush helpers below.  The mask is
 * derived from the size so the two constants cannot drift apart.
 */
#define CACHELINE_SIZE 64
#define CACHELINE_MASK (CACHELINE_SIZE - 1)
+
+static void inline
+anv_state_clflush(struct anv_state state)
+{
+   /* state.map may not be cacheline aligned, so round down the start pointer
+    * to a cacheline boundary so we flush all pages that contain the state.
+    */
+   void *end = state.map + state.alloc_size;
+   void *p = (void *) (((uintptr_t) state.map) & ~CACHELINE_MASK);
+
+   __builtin_ia32_sfence();
+   while (p < end) {
+      __builtin_ia32_clflush(p);
+      p += CACHELINE_SIZE;
+   }
+}
+
/* Allocator entry points; implementations live in anv_allocator.c. */
void anv_block_pool_init(struct anv_block_pool *pool,
                         struct anv_device *device, uint32_t block_size);
void anv_block_pool_finish(struct anv_block_pool *pool);
int32_t anv_block_pool_alloc(struct anv_block_pool *pool);
int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool);
void anv_block_pool_free(struct anv_block_pool *pool, int32_t offset);
void anv_state_pool_init(struct anv_state_pool *pool,
                         struct anv_block_pool *block_pool);
void anv_state_pool_finish(struct anv_state_pool *pool);
struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool,
                                      size_t state_size, size_t alignment);
void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state);
void anv_state_stream_init(struct anv_state_stream *stream,
                           struct anv_block_pool *block_pool);
void anv_state_stream_finish(struct anv_state_stream *stream);
struct anv_state anv_state_stream_alloc(struct anv_state_stream *stream,
                                        uint32_t size, uint32_t alignment);

/**
 * Implements a pool of re-usable BOs.  The interface is identical to that
 * of block_pool except that each block is its own BO.
 */
struct anv_bo_pool {
   struct anv_device *device;

   uint32_t bo_size;

   void *free_list;
};

void anv_bo_pool_init(struct anv_bo_pool *pool,
                      struct anv_device *device, uint32_t block_size);
void anv_bo_pool_finish(struct anv_bo_pool *pool);
VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo);
void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo);
+
+
void *anv_resolve_entrypoint(uint32_t index);

extern struct anv_dispatch_table dtable;

/* Look up an entrypoint in the global dispatch table, lazily resolving it on
 * first use.  Relies on `entrypoints` aliasing the named function pointers
 * in anv_dispatch_table.
 * NOTE(review): the lazy fill-in is an unsynchronized write; confirm that
 * concurrent first calls from multiple threads are benign here.
 */
#define ANV_CALL(func) ({ \
   if (dtable.func == NULL) { \
      size_t idx = offsetof(struct anv_dispatch_table, func) / sizeof(void *); \
      dtable.entrypoints[idx] = anv_resolve_entrypoint(idx); \
   } \
   dtable.func; \
})
+
/* Thin wrappers over the client-supplied VkAllocationCallbacks. */
static inline void *
anv_alloc(const VkAllocationCallbacks *alloc,
          size_t size, size_t align,
          VkSystemAllocationScope scope)
{
   return alloc->pfnAllocation(alloc->pUserData, size, align, scope);
}

static inline void *
anv_realloc(const VkAllocationCallbacks *alloc,
            void *ptr, size_t size, size_t align,
            VkSystemAllocationScope scope)
{
   return alloc->pfnReallocation(alloc->pUserData, ptr, size, align, scope);
}

static inline void
anv_free(const VkAllocationCallbacks *alloc, void *data)
{
   alloc->pfnFree(alloc->pUserData, data);
}
+
+static inline void *
+anv_alloc2(const VkAllocationCallbacks *parent_alloc,
+           const VkAllocationCallbacks *alloc,
+           size_t size, size_t align,
+           VkSystemAllocationScope scope)
+{
+   if (alloc)
+      return anv_alloc(alloc, size, align, scope);
+   else
+      return anv_alloc(parent_alloc, size, align, scope);
+}
+
+static inline void
+anv_free2(const VkAllocationCallbacks *parent_alloc,
+          const VkAllocationCallbacks *alloc,
+          void *data)
+{
+   if (alloc)
+      anv_free(alloc, data);
+   else
+      anv_free(parent_alloc, data);
+}
+
/* Driver-side representation of a VkPhysicalDevice (one Intel GPU). */
struct anv_physical_device {
    VK_LOADER_DATA                              _loader_data;

    struct anv_instance *                       instance;
    uint32_t                                    chipset_id;
    const char *                                path;      /* DRM device node path */
    const char *                                name;
    const struct brw_device_info *              info;
    uint64_t                                    aperture_size;
    struct brw_compiler *                       compiler;
    struct isl_device                           isl_dev;
};

/* Driver-side representation of a VkInstance. */
struct anv_instance {
    VK_LOADER_DATA                              _loader_data;

    VkAllocationCallbacks                       alloc;

    uint32_t                                    apiVersion;
    int                                         physicalDeviceCount;
    struct anv_physical_device                  physicalDevice;

    void *                                      wayland_wsi;
};

VkResult anv_init_wsi(struct anv_instance *instance);
void anv_finish_wsi(struct anv_instance *instance);
+
/* Pipelines and layouts used internally for meta operations (clears, blits). */
struct anv_meta_state {
   VkAllocationCallbacks alloc;

   struct {
      /**
       * Pipeline N is used to clear color attachment N of the current
       * subpass.
       *
       * HACK: We use one pipeline per color attachment to work around the
       * compiler's inability to dynamically set the render target index of
       * the render target write message.
       */
      struct anv_pipeline *color_pipelines[MAX_RTS];

      struct anv_pipeline *depth_only_pipeline;
      struct anv_pipeline *stencil_only_pipeline;
      struct anv_pipeline *depthstencil_pipeline;
   } clear;

   struct {
      VkRenderPass render_pass;

      /** Pipeline that blits from a 1D image. */
      VkPipeline pipeline_1d_src;

      /** Pipeline that blits from a 2D image. */
      VkPipeline pipeline_2d_src;

      /** Pipeline that blits from a 3D image. */
      VkPipeline pipeline_3d_src;

      VkPipelineLayout                          pipeline_layout;
      VkDescriptorSetLayout                     ds_layout;
   } blit;
};
+
/* Driver-side representation of a VkQueue. */
struct anv_queue {
    VK_LOADER_DATA                              _loader_data;

    struct anv_device *                         device;

    struct anv_state_pool *                     pool;
};

/* Cache of compiled shader programs, backed by a state stream and guarded
 * by a mutex.
 */
struct anv_pipeline_cache {
   struct anv_device *                          device;
   struct anv_state_stream                      program_stream;
   pthread_mutex_t                              mutex;
};

void anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                             struct anv_device *device);
void anv_pipeline_cache_finish(struct anv_pipeline_cache *cache);

/* Driver-side representation of a VkDevice: the kernel context plus all the
 * pools and global state the driver allocates from.
 */
struct anv_device {
    VK_LOADER_DATA                              _loader_data;

    VkAllocationCallbacks                       alloc;

    struct anv_instance *                       instance;
    uint32_t                                    chipset_id;
    struct brw_device_info                      info;
    struct isl_device                           isl_dev;
    int                                         context_id;
    int                                         fd;   /* DRM file descriptor */

    struct anv_bo_pool                          batch_bo_pool;

    struct anv_block_pool                       dynamic_state_block_pool;
    struct anv_state_pool                       dynamic_state_pool;

    struct anv_block_pool                       instruction_block_pool;
    struct anv_pipeline_cache                   default_pipeline_cache;

    struct anv_block_pool                       surface_state_block_pool;
    struct anv_state_pool                       surface_state_pool;

    struct anv_bo                               workaround_bo;

    struct anv_meta_state                       meta_state;

    struct anv_state                            border_colors;

    struct anv_queue                            queue;

    struct anv_block_pool                       scratch_block_pool;

    pthread_mutex_t                             mutex;
};
+
/* Thin wrappers around the i915 GEM ioctls; implemented in anv_gem.c. */
void* anv_gem_mmap(struct anv_device *device,
                   uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
void anv_gem_munmap(void *p, uint64_t size);
uint32_t anv_gem_create(struct anv_device *device, size_t size);
void anv_gem_close(struct anv_device *device, uint32_t gem_handle);
uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size);
int anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns);
int anv_gem_execbuffer(struct anv_device *device,
                       struct drm_i915_gem_execbuffer2 *execbuf);
int anv_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
                       uint32_t stride, uint32_t tiling);
int anv_gem_create_context(struct anv_device *device);
int anv_gem_destroy_context(struct anv_device *device, int context);
int anv_gem_get_param(int fd, uint32_t param);
bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
int anv_gem_get_aperture(int fd, uint64_t *size);
int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
                       uint32_t read_domains, uint32_t write_domain);

VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size);

/* Growable list of execbuf relocations plus the BOs they target; the two
 * arrays are kept parallel (entry i of relocs targets reloc_bos[i]).
 */
struct anv_reloc_list {
   size_t                                       num_relocs;
   size_t                                       array_length;
   struct drm_i915_gem_relocation_entry *       relocs;
   struct anv_bo **                             reloc_bos;
};

VkResult anv_reloc_list_init(struct anv_reloc_list *list,
                             const VkAllocationCallbacks *alloc);
void anv_reloc_list_finish(struct anv_reloc_list *list,
                           const VkAllocationCallbacks *alloc);

uint64_t anv_reloc_list_add(struct anv_reloc_list *list,
                            const VkAllocationCallbacks *alloc,
                            uint32_t offset, struct anv_bo *target_bo,
                            uint32_t delta);
+
/* One BO in a command buffer's chain of batch buffers. */
struct anv_batch_bo {
   /* Link in the anv_cmd_buffer.owned_batch_bos list */
   struct list_head                             link;

   struct anv_bo                                bo;

   /* Bytes actually consumed in this batch BO */
   size_t                                       length;

   /* Last seen surface state block pool bo offset */
   uint32_t                                     last_ss_pool_bo_offset;

   struct anv_reloc_list                        relocs;
};

/* A writable window of batch commands: `next` advances from `start` toward
 * `end`, and extend_cb is invoked when the window is full.
 */
struct anv_batch {
   const VkAllocationCallbacks *                alloc;

   void *                                       start;
   void *                                       end;
   void *                                       next;

   struct anv_reloc_list *                      relocs;

   /* This callback is called (with the associated user data) in the event
    * that the batch runs out of space.
    */
   VkResult (*extend_cb)(struct anv_batch *, void *);
   void *                                       user_data;
};

void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords);
void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other);
uint64_t anv_batch_emit_reloc(struct anv_batch *batch,
                              void *location, struct anv_bo *bo, uint32_t offset);

/* A GPU address expressed as a BO plus offset; bo == NULL means an absolute
 * offset (see __gen_combine_address).
 */
struct anv_address {
   struct anv_bo *bo;
   uint32_t offset;
};
+
+#define __gen_address_type struct anv_address
+#define __gen_user_data struct anv_batch
+
+static inline uint64_t
+__gen_combine_address(struct anv_batch *batch, void *location,
+                      const struct anv_address address, uint32_t delta)
+{
+   if (address.bo == NULL) {
+      return address.offset + delta;
+   } else {
+      assert(batch->start <= location && location < batch->end);
+
+      return anv_batch_emit_reloc(batch, location, address.bo, address.offset + delta);
+   }
+}
+
/* Wrapper macros needed to work around preprocessor argument issues.  In
 * particular, arguments don't get pre-evaluated if they are concatenated.
 * This means that, if you pass GENX(3DSTATE_PS) into the emit macro, the
 * GENX macro won't get evaluated if the emit macro contains "cmd ## foo".
 * We can work around this easily enough with these helpers.
 */
#define __anv_cmd_length(cmd) cmd ## _length
#define __anv_cmd_length_bias(cmd) cmd ## _length_bias
#define __anv_cmd_header(cmd) cmd ## _header
#define __anv_cmd_pack(cmd) cmd ## _pack

/* Emit one fixed-length command into the batch: reserve the dwords, fill a
 * template struct from the designated-initializer arguments, then pack it.
 * NOTE: `batch` is evaluated more than once — pass a simple lvalue.
 */
#define anv_batch_emit(batch, cmd, ...) do {                               \
      void *__dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd));   \
      struct cmd __template = {                                            \
         __anv_cmd_header(cmd),                                            \
         __VA_ARGS__                                                       \
      };                                                                   \
      __anv_cmd_pack(cmd)(batch, __dst, &__template);                      \
      VG(VALGRIND_CHECK_MEM_IS_DEFINED(__dst, __anv_cmd_length(cmd) * 4)); \
   } while (0)
+
/* Emit a variable-length command of `n` dwords and return a pointer to the
 * packed destination so the caller can fill in the trailing payload.
 * `n` is parenthesized: an argument like `a + b` (or a conditional) would
 * otherwise bind incorrectly in the DwordLength expression.
 */
#define anv_batch_emitn(batch, n, cmd, ...) ({          \
      void *__dst = anv_batch_emit_dwords((batch), (n)); \
      struct cmd __template = {                         \
         __anv_cmd_header(cmd),                         \
         .DwordLength = (n) - __anv_cmd_length_bias(cmd), \
         __VA_ARGS__                                    \
      };                                                \
      __anv_cmd_pack(cmd)(batch, __dst, &__template);   \
      __dst;                                            \
   })
+
/* Emit the bitwise OR of two pre-packed dword arrays (e.g. a pipeline's
 * baked state merged with dynamic state).  The arrays must be real arrays
 * of equal length (checked at compile time via ARRAY_SIZE).
 */
#define anv_batch_emit_merge(batch, dwords0, dwords1)                   \
   do {                                                                 \
      uint32_t *dw;                                                     \
                                                                        \
      static_assert(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1), "mismatch merge"); \
      dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0));         \
      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)                \
         dw[i] = (dwords0)[i] | (dwords1)[i];                           \
      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));\
   } while (0)

/* Allocate a state from `pool`, pack `cmd` into it, flush if the device has
 * no LLC, and evaluate to the anv_state.
 */
#define anv_state_pool_emit(pool, cmd, align, ...) ({                   \
      const uint32_t __size = __anv_cmd_length(cmd) * 4;                \
      struct anv_state __state =                                        \
         anv_state_pool_alloc((pool), __size, align);                   \
      struct cmd __template = {                                         \
         __VA_ARGS__                                                    \
      };                                                                \
      __anv_cmd_pack(cmd)(NULL, __state.map, &__template);              \
      VG(VALGRIND_CHECK_MEM_IS_DEFINED(__state.map, __anv_cmd_length(cmd) * 4)); \
      if (!(pool)->block_pool->device->info.has_llc)                    \
         anv_state_clflush(__state);                                    \
      __state;                                                          \
   })
+
/* Per-generation Memory Object Control State (cacheability) settings. */
#define GEN7_MOCS (struct GEN7_MEMORY_OBJECT_CONTROL_STATE) {  \
   .GraphicsDataTypeGFDT                        = 0,           \
   .LLCCacheabilityControlLLCCC                 = 0,           \
   .L3CacheabilityControlL3CC                   = 1,           \
}

#define GEN75_MOCS (struct GEN75_MEMORY_OBJECT_CONTROL_STATE) {  \
   .LLCeLLCCacheabilityControlLLCCC             = 0,           \
   .L3CacheabilityControlL3CC                   = 1,           \
}

#define GEN8_MOCS {                                     \
      .MemoryTypeLLCeLLCCacheabilityControl = WB,       \
      .TargetCache = L3DefertoPATforLLCeLLCselection,   \
      .AgeforQUADLRU = 0                                \
   }

/* Skylake: MOCS is now an index into an array of 62 different caching
 * configurations programmed by the kernel.
 */

#define GEN9_MOCS {                                     \
      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
      .IndextoMOCSTables                           = 2  \
   }

#define GEN9_MOCS_PTE {                                 \
      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
      .IndextoMOCSTables                           = 1  \
   }

/* Driver-side representation of a VkDeviceMemory allocation. */
struct anv_device_memory {
   struct anv_bo                                bo;
   uint32_t                                     type_index;
   VkDeviceSize                                 map_size;
   void *                                       map;
};

/**
 * Header for Vertex URB Entry (VUE)
 */
struct anv_vue_header {
   uint32_t Reserved;
   uint32_t RTAIndex; /* RenderTargetArrayIndex */
   uint32_t ViewportIndex;
   float PointWidth;
};
+
/* Layout of one binding within a descriptor set. */
struct anv_descriptor_set_binding_layout {
   /* Number of array elements in this binding */
   uint16_t array_size;

   /* Index into the flattened descriptor set */
   uint16_t descriptor_index;

   /* Index into the dynamic state array for a dynamic buffer */
   int16_t dynamic_offset_index;

   /* Index into the descriptor set buffer views */
   int16_t buffer_index;

   struct {
      /* Index into the binding table for the associated surface */
      int16_t surface_index;

      /* Index into the sampler table for the associated sampler */
      int16_t sampler_index;

      /* Index into the image table for the associated image */
      int16_t image_index;
   } stage[MESA_SHADER_STAGES];

   /* Immutable samplers (or NULL if no immutable samplers) */
   struct anv_sampler **immutable_samplers;
};

/* Layout of a whole descriptor set; trailed by binding_count entries. */
struct anv_descriptor_set_layout {
   /* Number of bindings in this descriptor set */
   uint16_t binding_count;

   /* Total size of the descriptor set with room for all array entries */
   uint16_t size;

   /* Shader stages affected by this descriptor set */
   uint16_t shader_stages;

   /* Number of buffers in this descriptor set */
   uint16_t buffer_count;

   /* Number of dynamic offsets used by this descriptor set */
   uint16_t dynamic_offset_count;

   /* Bindings in this descriptor set */
   struct anv_descriptor_set_binding_layout binding[0];
};
+
/* A single written descriptor; which union members are valid depends on
 * `type`.
 */
struct anv_descriptor {
   VkDescriptorType type;

   union {
      struct {
         union {
            struct anv_image_view *image_view;
         };
         struct anv_sampler *sampler;
      };

      struct anv_buffer_view *buffer_view;
   };
};

/* A descriptor set instance; trailed by layout->size descriptors. */
struct anv_descriptor_set {
   const struct anv_descriptor_set_layout *layout;
   struct anv_buffer_view *buffer_views;
   struct anv_descriptor descriptors[0];
};

VkResult
anv_descriptor_set_create(struct anv_device *device,
                          const struct anv_descriptor_set_layout *layout,
                          struct anv_descriptor_set **out_set);

void
anv_descriptor_set_destroy(struct anv_device *device,
                           struct anv_descriptor_set *set);

/* Maps a binding-table slot back to its (set, offset) in the descriptor sets. */
struct anv_pipeline_binding {
   /* The descriptor set this surface corresponds to */
   uint16_t set;

   /* Offset into the descriptor set */
   uint16_t offset;
};

/* Flattened view of all descriptor set layouts used by a pipeline, with
 * per-stage binding-table start indices.
 */
struct anv_pipeline_layout {
   struct {
      struct anv_descriptor_set_layout *layout;
      uint32_t dynamic_offset_start;
      struct {
         uint32_t surface_start;
         uint32_t sampler_start;
         uint32_t image_start;
      } stage[MESA_SHADER_STAGES];
   } set[MAX_SETS];

   uint32_t num_sets;

   struct {
      bool has_dynamic_offsets;
      uint32_t surface_count;
      struct anv_pipeline_binding *surface_to_descriptor;
      uint32_t sampler_count;
      struct anv_pipeline_binding *sampler_to_descriptor;
      uint32_t image_count;
   } stage[MESA_SHADER_STAGES];

   struct anv_pipeline_binding entries[0];
};

/* Driver-side representation of a VkBuffer. */
struct anv_buffer {
   struct anv_device *                          device;
   VkDeviceSize                                 size;

   VkBufferUsageFlags                           usage;

   /* Set when bound */
   struct anv_bo *                              bo;
   VkDeviceSize                                 offset;
};
+
/* Dirty flags tracked while recording a command buffer; the low bits mirror
 * the VkDynamicState enum one-for-one.
 */
enum anv_cmd_dirty_bits {
   ANV_CMD_DIRTY_DYNAMIC_VIEWPORT                  = 1 << 0, /* VK_DYNAMIC_STATE_VIEWPORT */
   ANV_CMD_DIRTY_DYNAMIC_SCISSOR                   = 1 << 1, /* VK_DYNAMIC_STATE_SCISSOR */
   ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH                = 1 << 2, /* VK_DYNAMIC_STATE_LINE_WIDTH */
   ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS                = 1 << 3, /* VK_DYNAMIC_STATE_DEPTH_BIAS */
   ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS           = 1 << 4, /* VK_DYNAMIC_STATE_BLEND_CONSTANTS */
   ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS              = 1 << 5, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS */
   ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK      = 1 << 6, /* VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK */
   ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK        = 1 << 7, /* VK_DYNAMIC_STATE_STENCIL_WRITE_MASK */
   ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE         = 1 << 8, /* VK_DYNAMIC_STATE_STENCIL_REFERENCE */
   ANV_CMD_DIRTY_DYNAMIC_ALL                       = (1 << 9) - 1,
   ANV_CMD_DIRTY_PIPELINE                          = 1 << 9,
   ANV_CMD_DIRTY_INDEX_BUFFER                      = 1 << 10,
   ANV_CMD_DIRTY_RENDER_TARGETS                    = 1 << 11,
};
typedef uint32_t anv_cmd_dirty_mask_t;

/* A bound vertex buffer. */
struct anv_vertex_binding {
   struct anv_buffer *                          buffer;
   VkDeviceSize                                 offset;
};

struct anv_push_constants {
   /* Current allocated size of this push constants data structure.
    * Because a decent chunk of it may not be used (images on SKL, for
    * instance), we won't actually allocate the entire structure up-front.
    */
   uint32_t size;

   /* Push constant data provided by the client through vkPushConstants */
   uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];

   /* Our hardware only provides zero-based vertex and instance id so, in
    * order to satisfy the vulkan requirements, we may have to push one or
    * both of these into the shader.
    */
   uint32_t base_vertex;
   uint32_t base_instance;

   /* Offsets and ranges for dynamically bound buffers */
   struct {
      uint32_t offset;
      uint32_t range;
   } dynamic[MAX_DYNAMIC_BUFFERS];

   /* Image data for image_load_store on pre-SKL */
   struct brw_image_param images[MAX_IMAGES];
};
+
/* CPU copy of all Vulkan dynamic state; fields correspond one-to-one with
 * the VK_DYNAMIC_STATE_* values and the vkCmdSet* commands.
 */
struct anv_dynamic_state {
   struct {
      uint32_t                                  count;
      VkViewport                                viewports[MAX_VIEWPORTS];
   } viewport;

   struct {
      uint32_t                                  count;
      VkRect2D                                  scissors[MAX_SCISSORS];
   } scissor;

   float                                        line_width;

   struct {
      float                                     bias;
      float                                     clamp;
      float                                     slope;
   } depth_bias;

   float                                        blend_constants[4];

   struct {
      float                                     min;
      float                                     max;
   } depth_bounds;

   struct {
      uint32_t                                  front;
      uint32_t                                  back;
   } stencil_compare_mask;

   struct {
      uint32_t                                  front;
      uint32_t                                  back;
   } stencil_write_mask;

   struct {
      uint32_t                                  front;
      uint32_t                                  back;
   } stencil_reference;
};

extern const struct anv_dynamic_state default_dynamic_state;

/* Copy only the fields of `src` selected by copy_mask (ANV_CMD_DIRTY_DYNAMIC_* bits). */
void anv_dynamic_state_copy(struct anv_dynamic_state *dest,
                            const struct anv_dynamic_state *src,
                            uint32_t copy_mask);
+
+/**
+ * Attachment state when recording a renderpass instance.
+ *
+ * The clear value is valid only if there exists a pending clear.
+ */
+struct anv_attachment_state {
+   VkImageAspectFlags                           pending_clear_aspects;
+   VkClearValue                                 clear_value;
+};
+
+/** State required while building cmd buffer */
+struct anv_cmd_state {
+   /* PIPELINE_SELECT.PipelineSelection */
+   uint32_t                                     current_pipeline;
+   uint32_t                                     current_l3_config;
+   /* Dirty bits for vertex-buffer bindings (presumably one bit per VB
+    * slot — confirm at the flush/emit sites) */
+   uint32_t                                     vb_dirty;
+   anv_cmd_dirty_mask_t                         dirty;
+   anv_cmd_dirty_mask_t                         compute_dirty;
+   uint32_t                                     num_workgroups_offset;
+   struct anv_bo                                *num_workgroups_bo;
+   /* Per-stage dirty masks; names suggest pending descriptor-set and
+    * push-constant re-emission — verify against flush code */
+   VkShaderStageFlags                           descriptors_dirty;
+   VkShaderStageFlags                           push_constants_dirty;
+   uint32_t                                     scratch_size;
+   struct anv_pipeline *                        pipeline;
+   struct anv_pipeline *                        compute_pipeline;
+   struct anv_framebuffer *                     framebuffer;
+   struct anv_render_pass *                     pass;
+   struct anv_subpass *                         subpass;
+   uint32_t                                     restart_index;
+   struct anv_vertex_binding                    vertex_bindings[MAX_VBS];
+   struct anv_descriptor_set *                  descriptors[MAX_SETS];
+   struct anv_push_constants *                  push_constants[MESA_SHADER_STAGES];
+   struct anv_state                             binding_tables[MESA_SHADER_STAGES];
+   struct anv_state                             samplers[MESA_SHADER_STAGES];
+   /* Currently-bound dynamic state; see anv_dynamic_state_copy() */
+   struct anv_dynamic_state                     dynamic;
+   bool                                         need_query_wa;
+
+   /**
+    * Array length is anv_cmd_state::pass::attachment_count. Array content is
+    * valid only when recording a render pass instance.
+    */
+   struct anv_attachment_state *                attachments;
+
+   /* State used only by the gen7 code paths */
+   struct {
+      struct anv_buffer *                       index_buffer;
+      uint32_t                                  index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */
+      uint32_t                                  index_offset;
+   } gen7;
+};
+
+/* A command pool: its allocation callbacks plus the list of command
+ * buffers allocated from it (linked via anv_cmd_buffer::pool_link).
+ */
+struct anv_cmd_pool {
+   VkAllocationCallbacks                        alloc;
+   struct list_head                             cmd_buffers;
+};
+
+/* Size in bytes of each batch BO in a command buffer's chain (presumably;
+ * confirm at the allocation sites in anv_batch_chain code) */
+#define ANV_CMD_BUFFER_BATCH_SIZE 8192
+
+/* How a command buffer's batch gets executed relative to its caller */
+enum anv_cmd_buffer_exec_mode {
+   ANV_CMD_BUFFER_EXEC_MODE_PRIMARY,
+   ANV_CMD_BUFFER_EXEC_MODE_EMIT,
+   ANV_CMD_BUFFER_EXEC_MODE_CHAIN,
+   ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN,
+};
+
+/* A VkCommandBuffer and everything needed to record and submit it */
+struct anv_cmd_buffer {
+   /* Standard Vulkan loader data; kept first, as is conventional for
+    * dispatchable objects */
+   VK_LOADER_DATA                               _loader_data;
+
+   struct anv_device *                          device;
+
+   struct anv_cmd_pool *                        pool;
+   /* Link in anv_cmd_pool::cmd_buffers */
+   struct list_head                             pool_link;
+
+   struct anv_batch                             batch;
+
+   /* Fields required for the actual chain of anv_batch_bo's.
+    *
+    * These fields are initialized by anv_cmd_buffer_init_batch_bo_chain().
+    */
+   struct list_head                             batch_bos;
+   enum anv_cmd_buffer_exec_mode                exec_mode;
+
+   /* A vector of anv_batch_bo pointers for every batch or surface buffer
+    * referenced by this command buffer
+    *
+    * initialized by anv_cmd_buffer_init_batch_bo_chain()
+    */
+   struct anv_vector                            seen_bbos;
+
+   /* A vector of int32_t's for every block of binding tables.
+    *
+    * initialized by anv_cmd_buffer_init_batch_bo_chain()
+    */
+   struct anv_vector                            bt_blocks;
+   /* Offset of the next free binding-table slot (presumably; see
+    * anv_cmd_buffer_alloc_binding_table / _new_binding_table_block) */
+   uint32_t                                     bt_next;
+   struct anv_reloc_list                        surface_relocs;
+
+   /* Information needed for execbuf
+    *
+    * These fields are generated by anv_cmd_buffer_prepare_execbuf().
+    */
+   struct {
+      struct drm_i915_gem_execbuffer2           execbuf;
+
+      struct drm_i915_gem_exec_object2 *        objects;
+      uint32_t                                  bo_count;
+      struct anv_bo **                          bos;
+
+      /* Allocated length of the 'objects' and 'bos' arrays */
+      uint32_t                                  array_length;
+
+      bool                                      need_reloc;
+   } execbuf2;
+
+   /* Serial for tracking buffer completion */
+   uint32_t                                     serial;
+
+   /* Stream objects for storing temporary data */
+   struct anv_state_stream                      surface_state_stream;
+   struct anv_state_stream                      dynamic_state_stream;
+
+   /* Recorded from begin/allocate info (primary vs secondary, one-time
+    * submit, etc.) */
+   VkCommandBufferUsageFlags                    usage_flags;
+   VkCommandBufferLevel                         level;
+
+   struct anv_cmd_state                         state;
+};
+
+VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
+                                  struct anv_cmd_buffer *secondary);
+void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer);
+
+VkResult anv_cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
+                                           unsigned stage, struct anv_state *bt_state);
+VkResult anv_cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
+                                      unsigned stage, struct anv_state *state);
+uint32_t gen7_cmd_buffer_flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer);
+void gen7_cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
+                                              uint32_t stages);
+
+struct anv_state anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer,
+                                             const void *data, uint32_t size, uint32_t alignment);
+struct anv_state anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer,
+                                              uint32_t *a, uint32_t *b,
+                                              uint32_t dwords, uint32_t alignment);
+
+struct anv_address
+anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer);
+struct anv_state
+anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t entries, uint32_t *state_offset);
+struct anv_state
+anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer);
+struct anv_state
+anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
+                                   uint32_t size, uint32_t alignment);
+
+VkResult
+anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer);
+
+void gen8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer);
+void gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer);
+
+void gen7_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+void gen75_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+void gen8_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+void gen9_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+
+void anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+
+void anv_cmd_state_setup_attachments(struct anv_cmd_buffer *cmd_buffer,
+                                     const VkRenderPassBeginInfo *info);
+
+void gen7_cmd_buffer_set_subpass(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_subpass *subpass);
+void gen8_cmd_buffer_set_subpass(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_subpass *subpass);
+void gen9_cmd_buffer_set_subpass(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_subpass *subpass);
+void anv_cmd_buffer_set_subpass(struct anv_cmd_buffer *cmd_buffer,
+                                  struct anv_subpass *subpass);
+
+struct anv_state
+anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
+                              gl_shader_stage stage);
+struct anv_state
+anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer);
+
+void anv_cmd_buffer_clear_subpass(struct anv_cmd_buffer *cmd_buffer);
+
+const struct anv_image_view *
+anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer);
+
+void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer);
+
+/* A VkFence backed by a single BO submitted via its own execbuf */
+struct anv_fence {
+   struct anv_bo bo;
+   /* Pre-built execbuf referencing exec2_objects[0] (the fence BO) */
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 exec2_objects[1];
+   /* Presumably set once the fence has been signaled — confirm in the
+    * fence wait/status implementation */
+   bool ready;
+};
+
+/* A VkEvent: a semaphore value plus the GPU-visible state backing it */
+struct anv_event {
+   uint32_t                                     semaphore;
+   struct anv_state                             state;
+};
+
+struct nir_shader;
+
+/* A VkShaderModule: either a pre-parsed NIR shader or 'size' bytes of
+ * raw module data stored inline after the struct.
+ */
+struct anv_shader_module {
+   struct nir_shader *                          nir;
+
+   /* Byte length of the trailing data[] payload */
+   uint32_t                                     size;
+   /* C99 flexible array member (was the GNU zero-length-array extension
+    * 'data[0]'); the payload is allocated in the same block as the
+    * struct. */
+   char                                         data[];
+};
+
+/* Map a single-bit VkShaderStageFlagBits value to the corresponding
+ * gl_shader_stage index (i.e. its bit position).
+ */
+static inline gl_shader_stage
+vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
+{
+   /* The caller must pass exactly one stage bit. */
+   assert(__builtin_popcount(vk_stage) == 1);
+   /* __builtin_ffs() returns the 1-based index of the lowest set bit. */
+   return __builtin_ffs(vk_stage) - 1;
+}
+
+/* Inverse of vk_to_mesa_shader_stage(): turn a stage index into its
+ * single-bit Vulkan stage flag.
+ */
+static inline VkShaderStageFlagBits
+mesa_to_vk_shader_stage(gl_shader_stage mesa_stage)
+{
+   return (VkShaderStageFlagBits)(1u << mesa_stage);
+}
+
+/* Bitmask covering every valid gl_shader_stage */
+#define ANV_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
+
+/* Iterate 'stage' over every stage whose bit is set in 'stage_bits'.
+ * __tmp holds the remaining bits; the comma expression in the condition
+ * first assigns the lowest set bit's index to 'stage', then tests __tmp,
+ * so the loop terminates once no bits remain.  Both loop variables share
+ * the single gl_shader_stage declaration.
+ */
+#define anv_foreach_stage(stage, stage_bits)                         \
+   for (gl_shader_stage stage,                                       \
+        __tmp = (gl_shader_stage)((stage_bits) & ANV_STAGE_MASK);    \
+        stage = __builtin_ffs(__tmp) - 1, __tmp;                     \
+        __tmp &= ~(1 << (stage)))
+
+/* A VkPipeline: compiled shader programs plus pre-packed HW state */
+struct anv_pipeline {
+   struct anv_device *                          device;
+   /* Pipeline state commands are baked into this batch at create time;
+    * batch_data is its backing storage */
+   struct anv_batch                             batch;
+   uint32_t                                     batch_data[512];
+   struct anv_reloc_list                        batch_relocs;
+   /* Which states the pipeline leaves dynamic (settable on the cmd buf) */
+   uint32_t                                     dynamic_state_mask;
+   struct anv_dynamic_state                     dynamic_state;
+
+   struct anv_pipeline_layout *                 layout;
+   bool                                         use_repclear;
+
+   struct brw_vs_prog_data                      vs_prog_data;
+   struct brw_wm_prog_data                      wm_prog_data;
+   struct brw_gs_prog_data                      gs_prog_data;
+   struct brw_cs_prog_data                      cs_prog_data;
+   bool                                         writes_point_size;
+   /* Per-stage prog data; presumably these point at the *_prog_data
+    * members above — confirm in anv_pipeline.c */
+   struct brw_stage_prog_data *                 prog_data[MESA_SHADER_STAGES];
+   uint32_t                                     scratch_start[MESA_SHADER_STAGES];
+   uint32_t                                     total_scratch;
+   /* URB entry allocation for the VS and GS stages */
+   struct {
+      uint32_t                                  vs_start;
+      uint32_t                                  vs_size;
+      uint32_t                                  nr_vs_entries;
+      uint32_t                                  gs_start;
+      uint32_t                                  gs_size;
+      uint32_t                                  nr_gs_entries;
+   } urb;
+
+   VkShaderStageFlags                           active_stages;
+   struct anv_state                             blend_state;
+   /* Kernel start pointers / dispatch parameters for the various
+    * SIMD variants of each stage */
+   uint32_t                                     vs_simd8;
+   uint32_t                                     vs_vec4;
+   uint32_t                                     ps_simd8;
+   uint32_t                                     ps_simd16;
+   uint32_t                                     ps_ksp0;
+   uint32_t                                     ps_ksp2;
+   uint32_t                                     ps_grf_start0;
+   uint32_t                                     ps_grf_start2;
+   uint32_t                                     gs_kernel;
+   uint32_t                                     gs_vertex_count;
+   uint32_t                                     cs_simd;
+
+   /* Vertex-input state from VkPipelineVertexInputStateCreateInfo */
+   uint32_t                                     vb_used;
+   uint32_t                                     binding_stride[MAX_VBS];
+   bool                                         instancing_enable[MAX_VBS];
+   bool                                         primitive_restart;
+   /* Hardware primitive topology value (presumably a 3DPRIM_* code) */
+   uint32_t                                     topology;
+
+   uint32_t                                     cs_thread_width_max;
+   uint32_t                                     cs_right_mask;
+
+   /* Pre-packed, generation-specific state dwords */
+   struct {
+      uint32_t                                  sf[7];
+      uint32_t                                  depth_stencil_state[3];
+   } gen7;
+
+   struct {
+      uint32_t                                  sf[4];
+      uint32_t                                  raster[5];
+      uint32_t                                  wm_depth_stencil[3];
+   } gen8;
+
+   struct {
+      uint32_t                                  wm_depth_stencil[4];
+   } gen9;
+};
+
+/* Driver-internal overrides applied on top of a
+ * VkGraphicsPipelineCreateInfo (used by meta operations).
+ */
+struct anv_graphics_pipeline_create_info {
+   /**
+    * If non-negative, overrides the color attachment count of the pipeline's
+    * subpass.
+    */
+   int8_t color_attachment_count;
+
+   bool                                         use_repclear;
+   bool                                         disable_viewport;
+   bool                                         disable_scissor;
+   bool                                         disable_vs;
+   bool                                         use_rectlist;
+};
+
+VkResult
+anv_pipeline_init(struct anv_pipeline *pipeline, struct anv_device *device,
+                  struct anv_pipeline_cache *cache,
+                  const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                  const struct anv_graphics_pipeline_create_info *extra,
+                  const VkAllocationCallbacks *alloc);
+
+VkResult
+anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
+                        struct anv_pipeline_cache *cache,
+                        const VkComputePipelineCreateInfo *info,
+                        struct anv_shader_module *module,
+                        const char *entrypoint,
+                        const VkSpecializationInfo *spec_info);
+
+VkResult
+anv_graphics_pipeline_create(VkDevice device,
+                             VkPipelineCache cache,
+                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                             const struct anv_graphics_pipeline_create_info *extra,
+                             const VkAllocationCallbacks *alloc,
+                             VkPipeline *pPipeline);
+
+VkResult
+gen7_graphics_pipeline_create(VkDevice _device,
+                              struct anv_pipeline_cache *cache,
+                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                              const struct anv_graphics_pipeline_create_info *extra,
+                              const VkAllocationCallbacks *alloc,
+                              VkPipeline *pPipeline);
+
+VkResult
+gen75_graphics_pipeline_create(VkDevice _device,
+                               struct anv_pipeline_cache *cache,
+                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                               const struct anv_graphics_pipeline_create_info *extra,
+                               const VkAllocationCallbacks *alloc,
+                               VkPipeline *pPipeline);
+
+VkResult
+gen8_graphics_pipeline_create(VkDevice _device,
+                              struct anv_pipeline_cache *cache,
+                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                              const struct anv_graphics_pipeline_create_info *extra,
+                              const VkAllocationCallbacks *alloc,
+                              VkPipeline *pPipeline);
+VkResult
+gen9_graphics_pipeline_create(VkDevice _device,
+                              struct anv_pipeline_cache *cache,
+                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                              const struct anv_graphics_pipeline_create_info *extra,
+                              const VkAllocationCallbacks *alloc,
+                              VkPipeline *pPipeline);
+VkResult
+gen7_compute_pipeline_create(VkDevice _device,
+                             struct anv_pipeline_cache *cache,
+                             const VkComputePipelineCreateInfo *pCreateInfo,
+                             const VkAllocationCallbacks *alloc,
+                             VkPipeline *pPipeline);
+VkResult
+gen75_compute_pipeline_create(VkDevice _device,
+                              struct anv_pipeline_cache *cache,
+                              const VkComputePipelineCreateInfo *pCreateInfo,
+                              const VkAllocationCallbacks *alloc,
+                              VkPipeline *pPipeline);
+
+VkResult
+gen8_compute_pipeline_create(VkDevice _device,
+                             struct anv_pipeline_cache *cache,
+                             const VkComputePipelineCreateInfo *pCreateInfo,
+                             const VkAllocationCallbacks *alloc,
+                             VkPipeline *pPipeline);
+VkResult
+gen9_compute_pipeline_create(VkDevice _device,
+                             struct anv_pipeline_cache *cache,
+                             const VkComputePipelineCreateInfo *pCreateInfo,
+                             const VkAllocationCallbacks *alloc,
+                             VkPipeline *pPipeline);
+
+/* Driver-side description of a VkFormat and its HW surface mappings */
+struct anv_format {
+   const VkFormat vk_format;
+   const char *name;
+   enum isl_format surface_format; /**< RENDER_SURFACE_STATE.SurfaceFormat */
+   const struct isl_format_layout *isl_layout;
+   uint16_t depth_format; /**< 3DSTATE_DEPTH_BUFFER.SurfaceFormat */
+   bool has_stencil;
+};
+
+const struct anv_format *
+anv_format_for_vk_format(VkFormat format);
+
+enum isl_format
+anv_get_isl_format(VkFormat format, VkImageAspectFlags aspect,
+                   VkImageTiling tiling);
+
+/* True when the format has neither a depth nor a stencil aspect. */
+static inline bool
+anv_format_is_color(const struct anv_format *format)
+{
+   return !(format->depth_format || format->has_stencil);
+}
+
+/* True when the format has a depth and/or stencil aspect. */
+static inline bool
+anv_format_is_depth_or_stencil(const struct anv_format *format)
+{
+   if (format->depth_format)
+      return true;
+   return format->has_stencil;
+}
+
+/**
+ * Subsurface of an anv_image.
+ */
+struct anv_surface {
+   /* ISL description of the surface layout */
+   struct isl_surf isl;
+
+   /**
+    * Offset from VkImage's base address, as bound by vkBindImageMemory().
+    */
+   uint32_t offset;
+};
+
+/* A VkImage and its backing surface(s) */
+struct anv_image {
+   VkImageType type;
+   /* The original VkFormat provided by the client.  This may not match any
+    * of the actual surface formats.
+    */
+   VkFormat vk_format;
+   const struct anv_format *format;
+   VkExtent3D extent;
+   uint32_t levels;
+   uint32_t array_size;
+   VkImageUsageFlags usage; /**< Superset of VkImageCreateInfo::usage. */
+   VkImageTiling tiling; /**< VkImageCreateInfo::tiling */
+
+   VkDeviceSize size;
+   uint32_t alignment;
+
+   /* Set when bound */
+   struct anv_bo *bo;
+   VkDeviceSize offset;
+
+   /* Which kinds of RENDER_SURFACE_STATE views this image will need */
+   bool needs_nonrt_surface_state:1;
+   bool needs_color_rt_surface_state:1;
+   bool needs_storage_surface_state:1;
+
+   /**
+    * Image subsurfaces
+    *
+    * For each foo, anv_image::foo_surface is valid if and only if
+    * anv_image::format has a foo aspect.
+    *
+    * The hardware requires that the depth buffer and stencil buffer be
+    * separate surfaces.  From Vulkan's perspective, though, depth and stencil
+    * reside in the same VkImage.  To satisfy both the hardware and Vulkan, we
+    * allocate the depth and stencil buffers as separate surfaces in the same
+    * bo.
+    */
+   union {
+      struct anv_surface color_surface;
+
+      struct {
+         struct anv_surface depth_surface;
+         struct anv_surface stencil_surface;
+      };
+   };
+};
+
+/* A VkImageView plus the pre-baked surface states for each usage */
+struct anv_image_view {
+   const struct anv_image *image; /**< VkImageViewCreateInfo::image */
+   struct anv_bo *bo;
+   uint32_t offset; /**< Offset into bo. */
+
+   VkImageAspectFlags aspect_mask;
+   VkFormat vk_format;
+   enum isl_format format;
+   VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */
+
+   /** RENDER_SURFACE_STATE when using image as a color render target. */
+   struct anv_state color_rt_surface_state;
+
+   /** RENDER_SURFACE_STATE when using image as a non render target. */
+   struct anv_state nonrt_surface_state;
+
+   /** RENDER_SURFACE_STATE when using image as a storage image. */
+   struct anv_state storage_surface_state;
+};
+
+/* Driver-internal wrapper around VkImageCreateInfo adding tiling and
+ * stride constraints (used by anv_image_create()).
+ */
+struct anv_image_create_info {
+   const VkImageCreateInfo *vk_info;
+   isl_tiling_flags_t isl_tiling_flags;
+   uint32_t stride;
+};
+
+VkResult anv_image_create(VkDevice _device,
+                          const struct anv_image_create_info *info,
+                          const VkAllocationCallbacks* alloc,
+                          VkImage *pImage);
+
+struct anv_surface *
+anv_image_get_surface_for_aspect_mask(struct anv_image *image,
+                                      VkImageAspectFlags aspect_mask);
+
+void anv_image_view_init(struct anv_image_view *view,
+                         struct anv_device *device,
+                         const VkImageViewCreateInfo* pCreateInfo,
+                         struct anv_cmd_buffer *cmd_buffer);
+
+void
+gen7_image_view_init(struct anv_image_view *iview,
+                     struct anv_device *device,
+                     const VkImageViewCreateInfo* pCreateInfo,
+                     struct anv_cmd_buffer *cmd_buffer);
+
+void
+gen75_image_view_init(struct anv_image_view *iview,
+                      struct anv_device *device,
+                      const VkImageViewCreateInfo* pCreateInfo,
+                      struct anv_cmd_buffer *cmd_buffer);
+
+void
+gen8_image_view_init(struct anv_image_view *iview,
+                     struct anv_device *device,
+                     const VkImageViewCreateInfo* pCreateInfo,
+                     struct anv_cmd_buffer *cmd_buffer);
+
+void
+gen9_image_view_init(struct anv_image_view *iview,
+                     struct anv_device *device,
+                     const VkImageViewCreateInfo* pCreateInfo,
+                     struct anv_cmd_buffer *cmd_buffer);
+
+/* A VkBufferView plus its pre-baked surface states */
+struct anv_buffer_view {
+   enum isl_format format; /**< VkBufferViewCreateInfo::format */
+   struct anv_bo *bo;
+   uint32_t offset; /**< Offset into bo. */
+   uint64_t range; /**< VkBufferViewCreateInfo::range */
+
+   struct anv_state surface_state;
+   struct anv_state storage_surface_state;
+};
+
+const struct anv_format *
+anv_format_for_descriptor_type(VkDescriptorType type);
+
+void anv_fill_buffer_surface_state(struct anv_device *device, void *state,
+                                   enum isl_format format,
+                                   uint32_t offset, uint32_t range,
+                                   uint32_t stride);
+
+void gen7_fill_buffer_surface_state(void *state, enum isl_format format,
+                                    uint32_t offset, uint32_t range,
+                                    uint32_t stride);
+void gen75_fill_buffer_surface_state(void *state, enum isl_format format,
+                                     uint32_t offset, uint32_t range,
+                                     uint32_t stride);
+void gen8_fill_buffer_surface_state(void *state, enum isl_format format,
+                                    uint32_t offset, uint32_t range,
+                                    uint32_t stride);
+void gen9_fill_buffer_surface_state(void *state, enum isl_format format,
+                                    uint32_t offset, uint32_t range,
+                                    uint32_t stride);
+
+void anv_image_view_fill_image_param(struct anv_device *device,
+                                     struct anv_image_view *view,
+                                     struct brw_image_param *param);
+void anv_buffer_view_fill_image_param(struct anv_device *device,
+                                      struct anv_buffer_view *view,
+                                      struct brw_image_param *param);
+
+/* A VkSampler: four packed state dwords (presumably SAMPLER_STATE —
+ * confirm against the gen-specific sampler creation code) */
+struct anv_sampler {
+   uint32_t state[4];
+};
+
+/* A VkFramebuffer: dimensions plus its attachment image views */
+struct anv_framebuffer {
+   uint32_t                                     width;
+   uint32_t                                     height;
+   uint32_t                                     layers;
+
+   /* Number of elements in the trailing attachments array */
+   uint32_t                                     attachment_count;
+   /* C99 flexible array member (was the GNU zero-length-array extension
+    * '[0]'); attachment_count entries are allocated immediately after
+    * the struct.  Field alignment fixed to match the rest of the file. */
+   const struct anv_image_view *                attachments[];
+};
+
+/* One subpass of a render pass; arrays hold attachment indices into the
+ * owning anv_render_pass (mirroring VkSubpassDescription).
+ */
+struct anv_subpass {
+   uint32_t                                     input_count;
+   uint32_t *                                   input_attachments;
+   uint32_t                                     color_count;
+   uint32_t *                                   color_attachments;
+   uint32_t *                                   resolve_attachments;
+   uint32_t                                     depth_stencil_attachment;
+};
+
+/* Per-attachment render-pass state (from VkAttachmentDescription) */
+struct anv_render_pass_attachment {
+   const struct anv_format                      *format;
+   uint32_t                                     samples;
+   VkAttachmentLoadOp                           load_op;
+   VkAttachmentLoadOp                           stencil_load_op;
+};
+
+/* A VkRenderPass: its attachments plus subpass_count subpasses stored
+ * inline after the struct.
+ */
+struct anv_render_pass {
+   uint32_t                                     attachment_count;
+   uint32_t                                     subpass_count;
+   uint32_t *                                   subpass_attachments;
+   struct anv_render_pass_attachment *          attachments;
+   /* C99 flexible array member (was the GNU zero-length-array extension
+    * '[0]'); subpass_count entries allocated in the same block. */
+   struct anv_subpass                           subpasses[];
+};
+
+extern struct anv_render_pass anv_meta_dummy_renderpass;
+
+/* Layout of one query slot within the pool's BO: begin/end values plus
+ * an availability word (presumably per VkGetQueryPoolResults semantics —
+ * confirm in anv_query.c).
+ */
+struct anv_query_pool_slot {
+   uint64_t begin;
+   uint64_t end;
+   uint64_t available;
+};
+
+/* A VkQueryPool: 'slots' query slots stored in a single BO */
+struct anv_query_pool {
+   VkQueryType                                  type;
+   uint32_t                                     slots;
+   struct anv_bo                                bo;
+};
+
+VkResult anv_device_init_meta(struct anv_device *device);
+void anv_device_finish_meta(struct anv_device *device);
+
+void *anv_lookup_entrypoint(const char *name);
+
+void anv_dump_image_to_ppm(struct anv_device *device,
+                           struct anv_image *image, unsigned miplevel,
+                           unsigned array_layer, const char *filename);
+
+/* Define from_handle/to_handle converters between a dispatchable Vulkan
+ * handle type and its driver struct via direct pointer casts.
+ */
+#define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType)                      \
+                                                                           \
+   static inline struct __anv_type *                                       \
+   __anv_type ## _from_handle(__VkType _handle)                            \
+   {                                                                       \
+      return (struct __anv_type *) _handle;                                \
+   }                                                                       \
+                                                                           \
+   static inline __VkType                                                  \
+   __anv_type ## _to_handle(struct __anv_type *_obj)                       \
+   {                                                                       \
+      return (__VkType) _obj;                                              \
+   }
+
+/* Same as ANV_DEFINE_HANDLE_CASTS but for non-dispatchable handles,
+ * which round-trip through uintptr_t (they may be 64-bit integers
+ * rather than pointers on some platforms).
+ */
+#define ANV_DEFINE_NONDISP_HANDLE_CASTS(__anv_type, __VkType)              \
+                                                                           \
+   static inline struct __anv_type *                                       \
+   __anv_type ## _from_handle(__VkType _handle)                            \
+   {                                                                       \
+      return (struct __anv_type *)(uintptr_t) _handle;                     \
+   }                                                                       \
+                                                                           \
+   static inline __VkType                                                  \
+   __anv_type ## _to_handle(struct __anv_type *_obj)                       \
+   {                                                                       \
+      return (__VkType)(uintptr_t) _obj;                                   \
+   }
+
+/* Declare a local struct pointer __name initialized from a Vulkan handle */
+#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \
+   struct __anv_type *__name = __anv_type ## _from_handle(__handle)
+
+/* Instantiate handle<->struct converters for every anv object type.
+ * No trailing semicolons: the macros expand to function definitions, and
+ * an extra ';' would be an empty file-scope declaration (not valid ISO C;
+ * warns under -Wpedantic).  Removed the stray one after anv_image_view.
+ */
+ANV_DEFINE_HANDLE_CASTS(anv_cmd_buffer, VkCommandBuffer)
+ANV_DEFINE_HANDLE_CASTS(anv_device, VkDevice)
+ANV_DEFINE_HANDLE_CASTS(anv_instance, VkInstance)
+ANV_DEFINE_HANDLE_CASTS(anv_physical_device, VkPhysicalDevice)
+ANV_DEFINE_HANDLE_CASTS(anv_queue, VkQueue)
+
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_cmd_pool, VkCommandPool)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, VkBuffer)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, VkBufferView)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set, VkDescriptorSet)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set_layout, VkDescriptorSetLayout)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, VkDeviceMemory)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_fence, VkFence)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_event, VkEvent)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_framebuffer, VkFramebuffer)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_image, VkImage)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_image_view, VkImageView)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_cache, VkPipelineCache)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, VkPipeline)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, VkPipelineLayout)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, VkQueryPool)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_render_pass, VkRenderPass)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, VkSampler)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_shader_module, VkShaderModule)
+
+/* Define a checked downcast from the driver's generic struct type to a
+ * const pointer of the given Vulkan input-structure type.
+ */
+#define ANV_DEFINE_STRUCT_CASTS(__anv_type, __VkType) \
+   \
+   static inline const __VkType * \
+   __anv_type ## _to_ ## __VkType(const struct __anv_type *__anv_obj) \
+   { \
+      return (const __VkType *) __anv_obj; \
+   }
+
+/* Declare a local const __VkType pointer from an anv_common pointer */
+#define ANV_COMMON_TO_STRUCT(__VkType, __vk_name, __common_name) \
+   const __VkType *__vk_name = anv_common_to_ ## __VkType(__common_name)
+
+ANV_DEFINE_STRUCT_CASTS(anv_common, VkMemoryBarrier)
+ANV_DEFINE_STRUCT_CASTS(anv_common, VkBufferMemoryBarrier)
+ANV_DEFINE_STRUCT_CASTS(anv_common, VkImageMemoryBarrier)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/vulkan/anv_query.c b/src/vulkan/anv_query.c
new file mode 100644 (file)
index 0000000..5b05234
--- /dev/null
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+/* vkCreateQueryPool: allocates an anv_query_pool plus a GEM buffer object
+ * holding one anv_query_pool_slot per query, and maps the BO so results can
+ * be read directly by the CPU. */
+VkResult anv_CreateQueryPool(
+    VkDevice                                    _device,
+    const VkQueryPoolCreateInfo*                pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkQueryPool*                                pQueryPool)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_query_pool *pool;
+   VkResult result;
+   uint32_t slot_size;
+   uint64_t size;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
+
+   switch (pCreateInfo->queryType) {
+   case VK_QUERY_TYPE_OCCLUSION:
+   case VK_QUERY_TYPE_TIMESTAMP:
+      break;
+   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+      /* Pipeline statistics queries are not implemented. */
+      return VK_ERROR_INCOMPATIBLE_DRIVER;
+   default:
+      /* NOTE(review): with NDEBUG this falls through and creates a pool
+       * with an invalid type — confirm that is acceptable. */
+      assert(!"Invalid query type");
+   }
+
+   slot_size = sizeof(struct anv_query_pool_slot);
+   pool = anv_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pool == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   pool->type = pCreateInfo->queryType;
+   pool->slots = pCreateInfo->queryCount;
+
+   size = pCreateInfo->queryCount * slot_size;
+   result = anv_bo_init_new(&pool->bo, device, size);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   /* NOTE(review): the mmap result is not checked — verify anv_gem_mmap
+    * cannot fail here, or add error handling. */
+   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);
+
+   *pQueryPool = anv_query_pool_to_handle(pool);
+
+   return VK_SUCCESS;
+
+ fail:
+   anv_free2(&device->alloc, pAllocator, pool);
+
+   return result;
+}
+
+/* vkDestroyQueryPool: unmaps and closes the pool's GEM BO, then frees the
+ * pool object itself. */
+void anv_DestroyQueryPool(
+    VkDevice                                    _device,
+    VkQueryPool                                 _pool,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
+
+   anv_gem_munmap(pool->bo.map, pool->bo.size);
+   anv_gem_close(device, pool->bo.gem_handle);
+   anv_free2(&device->alloc, pAllocator, pool);
+}
+
+/* vkGetQueryPoolResults: copies query results from the CPU-mapped pool BO
+ * into pData, one slot per query, advancing by 'stride' bytes.  Occlusion
+ * results are end - begin; timestamps use the slot's begin field.  With
+ * VK_QUERY_RESULT_WAIT_BIT we block on the BO first; with
+ * VK_QUERY_RESULT_64_BIT values are written as uint64_t, otherwise clamped
+ * to uint32_t.  Availability, when requested, is written in the element
+ * following the result. */
+VkResult anv_GetQueryPoolResults(
+    VkDevice                                    _device,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount,
+    size_t                                      dataSize,
+    void*                                       pData,
+    VkDeviceSize                                stride,
+    VkQueryResultFlags                          flags)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+   int64_t timeout = INT64_MAX;
+   /* Initialized so an unexpected pool type (reachable when the assert
+    * below compiles away under NDEBUG) writes a defined value instead of
+    * reading an uninitialized variable (undefined behavior). */
+   uint64_t result = 0;
+   int ret;
+
+   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
+          pool->type == VK_QUERY_TYPE_TIMESTAMP);
+
+   if (pData == NULL)
+      return VK_SUCCESS;
+
+   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
+      if (ret == -1) {
+         /* We don't know the real error. */
+         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "gem_wait failed %m");
+      }
+   }
+
+   void *data_end = pData + dataSize;
+   struct anv_query_pool_slot *slot = pool->bo.map;
+
+   for (uint32_t i = 0; i < queryCount; i++) {
+      switch (pool->type) {
+      case VK_QUERY_TYPE_OCCLUSION: {
+         result = slot[firstQuery + i].end - slot[firstQuery + i].begin;
+         break;
+      }
+      case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+         /* Not yet implemented */
+         break;
+      case VK_QUERY_TYPE_TIMESTAMP: {
+         result = slot[firstQuery + i].begin;
+         break;
+      }
+      default:
+         assert(!"Invalid query type");
+      }
+
+      if (flags & VK_QUERY_RESULT_64_BIT) {
+         uint64_t *dst = pData;
+         dst[0] = result;
+         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+            dst[1] = slot[firstQuery + i].available;
+      } else {
+         uint32_t *dst = pData;
+         /* 32-bit results saturate rather than wrap. */
+         if (result > UINT32_MAX)
+            result = UINT32_MAX;
+         dst[0] = result;
+         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+            dst[1] = slot[firstQuery + i].available;
+      }
+
+      pData += stride;
+      if (pData >= data_end)
+         break;
+   }
+
+   return VK_SUCCESS;
+}
+
+/* vkCmdResetQueryPool: clears the 'available' bit of each slot in the given
+ * range.  Done immediately on the CPU via the mapped BO rather than being
+ * recorded into the command buffer. */
+void anv_CmdResetQueryPool(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount)
+{
+   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+
+   for (uint32_t i = 0; i < queryCount; i++) {
+      switch (pool->type) {
+      case VK_QUERY_TYPE_OCCLUSION:
+      case VK_QUERY_TYPE_TIMESTAMP: {
+         struct anv_query_pool_slot *slot = pool->bo.map;
+         slot[firstQuery + i].available = 0;
+         break;
+      }
+      default:
+         assert(!"Invalid query type");
+      }
+   }
+}
diff --git a/src/vulkan/anv_util.c b/src/vulkan/anv_util.c
new file mode 100644 (file)
index 0000000..22fd01c
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "anv_private.h"
+
+/** Log an error message to stderr (printf-style varargs wrapper around
+ *  anv_loge_v()). */
+void anv_printflike(1, 2)
+anv_loge(const char *format, ...)
+{
+   va_list va;
+
+   va_start(va, format);
+   anv_loge_v(format, va);
+   va_end(va);
+}
+
+/** \see anv_loge() — va_list variant; prefixes "vk: error: " and appends a
+ *  newline. */
+void
+anv_loge_v(const char *format, va_list va)
+{
+   fprintf(stderr, "vk: error: ");
+   vfprintf(stderr, format, va);
+   fprintf(stderr, "\n");
+}
+
+/* Backend of the anv_finishme() macro: prints "<file>:<line>: FINISHME:"
+ * followed by the formatted message.  Messages longer than 255 bytes are
+ * silently truncated by vsnprintf. */
+void anv_printflike(3, 4)
+__anv_finishme(const char *file, int line, const char *format, ...)
+{
+   va_list ap;
+   char buffer[256];
+
+   va_start(ap, format);
+   vsnprintf(buffer, sizeof(buffer), format, ap);
+   va_end(ap);
+
+   fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buffer);
+}
+
+/* Print a formatted error and abort the process (varargs wrapper around
+ * anv_abortfv()).  Does not return. */
+void anv_noreturn anv_printflike(1, 2)
+anv_abortf(const char *format, ...)
+{
+   va_list va;
+
+   va_start(va, format);
+   anv_abortfv(format, va);
+   va_end(va);
+}
+
+/* va_list variant of anv_abortf(): log "vk: error: <message>" to stderr,
+ * then abort().  Does not return. */
+void anv_noreturn
+anv_abortfv(const char *format, va_list va)
+{
+   fprintf(stderr, "vk: error: ");
+   vfprintf(stderr, format, va);
+   fprintf(stderr, "\n");
+   abort();
+}
+
+/* Backend of the vk_error()/vk_errorf() macros: logs the error's symbolic
+ * name (and an optional formatted message) with file/line context, then
+ * returns the error unchanged so callers can write 'return vk_error(...)'. */
+VkResult
+__vk_errorf(VkResult error, const char *file, int line, const char *format, ...)
+{
+   va_list ap;
+   char buffer[256];
+
+#define ERROR_CASE(error) case error: error_str = #error; break;
+
+   const char *error_str;
+   /* Cast to int32_t so extension error values outside the core enum range
+    * don't trip -Wswitch warnings. */
+   switch ((int32_t)error) {
+
+   /* Core errors */
+   ERROR_CASE(VK_ERROR_OUT_OF_HOST_MEMORY)
+   ERROR_CASE(VK_ERROR_OUT_OF_DEVICE_MEMORY)
+   ERROR_CASE(VK_ERROR_INITIALIZATION_FAILED)
+   ERROR_CASE(VK_ERROR_DEVICE_LOST)
+   ERROR_CASE(VK_ERROR_MEMORY_MAP_FAILED)
+   ERROR_CASE(VK_ERROR_LAYER_NOT_PRESENT)
+   ERROR_CASE(VK_ERROR_EXTENSION_NOT_PRESENT)
+   ERROR_CASE(VK_ERROR_INCOMPATIBLE_DRIVER)
+
+   /* Extension errors */
+   ERROR_CASE(VK_ERROR_OUT_OF_DATE_KHR)
+
+   default:
+      assert(!"Unknown error");
+      error_str = "unknown error";
+   }
+
+#undef ERROR_CASE
+
+   if (format) {
+      va_start(ap, format);
+      vsnprintf(buffer, sizeof(buffer), format, ap);
+      va_end(ap);
+
+      fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str);
+   } else {
+      fprintf(stderr, "%s:%d: %s\n", file, line, error_str);
+   }
+
+   return error;
+}
+
+/* Initialize a growable ring buffer of fixed-size elements.  Both 'size'
+ * (total bytes) and 'element_size' must be powers of two so that offsets
+ * can be masked instead of taken modulo.  Returns nonzero on success,
+ * 0 if the backing allocation failed. */
+int
+anv_vector_init(struct anv_vector *vector, uint32_t element_size, uint32_t size)
+{
+   assert(util_is_power_of_two(size));
+   assert(element_size < size && util_is_power_of_two(element_size));
+
+   vector->head = 0;
+   vector->tail = 0;
+   vector->element_size = element_size;
+   vector->size = size;
+   vector->data = malloc(size);
+
+   return vector->data != NULL;
+}
+
+/* Reserve space for one element at the head of the ring buffer and return
+ * a pointer to it, or NULL on allocation failure.  head and tail are
+ * monotonically increasing byte offsets; the low bits (masked by size - 1)
+ * index into the storage.  When the buffer is full it is doubled, copying
+ * the live span and unwrapping it if it straddled the end of the old
+ * allocation. */
+void *
+anv_vector_add(struct anv_vector *vector)
+{
+   uint32_t offset, size, split, tail;
+   void *data;
+
+   if (vector->head - vector->tail == vector->size) {
+      /* Full: grow to twice the current size. */
+      size = vector->size * 2;
+      data = malloc(size);
+      if (data == NULL)
+         return NULL;
+      /* 'split' is tail rounded up to the next old-size boundary; data at
+       * offsets >= split wrapped around the old buffer. */
+      split = align_u32(vector->tail, vector->size);
+      tail = vector->tail & (vector->size - 1);
+      if (vector->head - split < vector->size) {
+         /* Live span wraps: copy [tail, split) then the wrapped part
+          * starting at the old-size boundary of the new buffer. */
+         memcpy(data + tail,
+                vector->data + tail,
+                split - vector->tail);
+         memcpy(data + vector->size,
+                vector->data, vector->head - split);
+      } else {
+         /* Live span is contiguous: single copy. */
+         memcpy(data + tail,
+                vector->data + tail,
+                vector->head - vector->tail);
+      }
+      free(vector->data);
+      vector->data = data;
+      vector->size = size;
+   }
+
+   assert(vector->head - vector->tail < vector->size);
+
+   offset = vector->head & (vector->size - 1);
+   vector->head += vector->element_size;
+
+   return vector->data + offset;
+}
+
+/* Pop the oldest element from the tail of the ring buffer.  Returns a
+ * pointer to it, or NULL if the buffer is empty.  The pointed-at storage
+ * remains valid only until the slot is reused by a later anv_vector_add(). */
+void *
+anv_vector_remove(struct anv_vector *vector)
+{
+   uint32_t offset;
+
+   if (vector->head == vector->tail)
+      return NULL;
+
+   assert(vector->head - vector->tail <= vector->size);
+
+   offset = vector->tail & (vector->size - 1);
+   vector->tail += vector->element_size;
+
+   return vector->data + offset;
+}
diff --git a/src/vulkan/anv_wsi.c b/src/vulkan/anv_wsi.c
new file mode 100644 (file)
index 0000000..c181cd4
--- /dev/null
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_wsi.h"
+
+/* Initialize all window-system-integration backends for the instance:
+ * X11 always, Wayland when built with HAVE_WAYLAND_PLATFORM.  If Wayland
+ * init fails, the already-initialized X11 backend is torn down so the
+ * instance is left in a clean state. */
+VkResult
+anv_init_wsi(struct anv_instance *instance)
+{
+   VkResult result;
+
+   result = anv_x11_init_wsi(instance);
+   if (result != VK_SUCCESS)
+      return result;
+
+#ifdef HAVE_WAYLAND_PLATFORM
+   result = anv_wl_init_wsi(instance);
+   if (result != VK_SUCCESS) {
+      anv_x11_finish_wsi(instance);
+      return result;
+   }
+#endif
+
+   return VK_SUCCESS;
+}
+
+/* Tear down WSI backends in reverse order of anv_init_wsi(). */
+void
+anv_finish_wsi(struct anv_instance *instance)
+{
+#ifdef HAVE_WAYLAND_PLATFORM
+   anv_wl_finish_wsi(instance);
+#endif
+   anv_x11_finish_wsi(instance);
+}
+
+/* vkDestroySurfaceKHR: dispatches to the surface's platform-specific
+ * destroy callback. */
+void anv_DestroySurfaceKHR(
+    VkInstance                                   instance,
+    VkSurfaceKHR                                 _surface,
+    const VkAllocationCallbacks*                 pAllocator)
+{
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, _surface);
+
+   surface->destroy(surface, pAllocator);
+}
+
+/* vkGetPhysicalDeviceSurfaceSupportKHR: dispatches to the surface's
+ * platform-specific get_support callback. */
+VkResult anv_GetPhysicalDeviceSurfaceSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    VkSurfaceKHR                                _surface,
+    VkBool32*                                   pSupported)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, _surface);
+
+   return surface->get_support(surface, device, queueFamilyIndex, pSupported);
+}
+
+/* vkGetPhysicalDeviceSurfaceCapabilitiesKHR: dispatches to the surface's
+ * platform-specific get_capabilities callback. */
+VkResult anv_GetPhysicalDeviceSurfaceCapabilitiesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                _surface,
+    VkSurfaceCapabilitiesKHR*                   pSurfaceCapabilities)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, _surface);
+
+   return surface->get_capabilities(surface, device, pSurfaceCapabilities);
+}
+
+/* vkGetPhysicalDeviceSurfaceFormatsKHR: dispatches to the surface's
+ * platform-specific get_formats callback. */
+VkResult anv_GetPhysicalDeviceSurfaceFormatsKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                _surface,
+    uint32_t*                                   pSurfaceFormatCount,
+    VkSurfaceFormatKHR*                         pSurfaceFormats)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, _surface);
+
+   return surface->get_formats(surface, device, pSurfaceFormatCount,
+                               pSurfaceFormats);
+}
+
+/* vkGetPhysicalDeviceSurfacePresentModesKHR: dispatches to the surface's
+ * platform-specific get_present_modes callback. */
+VkResult anv_GetPhysicalDeviceSurfacePresentModesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                _surface,
+    uint32_t*                                   pPresentModeCount,
+    VkPresentModeKHR*                           pPresentModes)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, _surface);
+
+   return surface->get_present_modes(surface, device, pPresentModeCount,
+                                     pPresentModes);
+}
+
+/* vkCreateSwapchainKHR: asks the target surface's backend to build a
+ * platform-specific swapchain, then wraps it in a Vulkan handle. */
+VkResult anv_CreateSwapchainKHR(
+    VkDevice                                     _device,
+    const VkSwapchainCreateInfoKHR*              pCreateInfo,
+    const VkAllocationCallbacks*                 pAllocator,
+    VkSwapchainKHR*                              pSwapchain)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_wsi_surface, surface, pCreateInfo->surface);
+   struct anv_swapchain *swapchain;
+
+   VkResult result = surface->create_swapchain(surface, device, pCreateInfo,
+                                               pAllocator, &swapchain);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *pSwapchain = anv_swapchain_to_handle(swapchain);
+
+   return VK_SUCCESS;
+}
+
+/* vkDestroySwapchainKHR: dispatches to the swapchain's destroy callback. */
+void anv_DestroySwapchainKHR(
+    VkDevice                                     device,
+    VkSwapchainKHR                               _swapchain,
+    const VkAllocationCallbacks*                 pAllocator)
+{
+   ANV_FROM_HANDLE(anv_swapchain, swapchain, _swapchain);
+
+   swapchain->destroy(swapchain, pAllocator);
+}
+
+/* vkGetSwapchainImagesKHR: dispatches to the swapchain's get_images
+ * callback. */
+VkResult anv_GetSwapchainImagesKHR(
+    VkDevice                                     device,
+    VkSwapchainKHR                               _swapchain,
+    uint32_t*                                    pSwapchainImageCount,
+    VkImage*                                     pSwapchainImages)
+{
+   ANV_FROM_HANDLE(anv_swapchain, swapchain, _swapchain);
+
+   return swapchain->get_images(swapchain, pSwapchainImageCount,
+                                pSwapchainImages);
+}
+
+/* vkAcquireNextImageKHR: dispatches to the swapchain's acquire callback.
+ * NOTE(review): the 'fence' parameter is not forwarded, so a caller's fence
+ * is never signaled — confirm whether fences are intentionally unsupported
+ * here. */
+VkResult anv_AcquireNextImageKHR(
+    VkDevice                                     device,
+    VkSwapchainKHR                               _swapchain,
+    uint64_t                                     timeout,
+    VkSemaphore                                  semaphore,
+    VkFence                                      fence,
+    uint32_t*                                    pImageIndex)
+{
+   ANV_FROM_HANDLE(anv_swapchain, swapchain, _swapchain);
+
+   return swapchain->acquire_next_image(swapchain, timeout, semaphore,
+                                        pImageIndex);
+}
+
+/* vkQueuePresentKHR: presents each swapchain's selected image in order,
+ * stopping at the first failure. */
+VkResult anv_QueuePresentKHR(
+    VkQueue                                  _queue,
+    const VkPresentInfoKHR*                  pPresentInfo)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   VkResult result;
+
+   for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
+      ANV_FROM_HANDLE(anv_swapchain, swapchain, pPresentInfo->pSwapchains[i]);
+
+      assert(swapchain->device == queue->device);
+
+      result = swapchain->queue_present(swapchain, queue,
+                                        pPresentInfo->pImageIndices[i]);
+      /* TODO: What if one of them returns OUT_OF_DATE? */
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
diff --git a/src/vulkan/anv_wsi.h b/src/vulkan/anv_wsi.h
new file mode 100644 (file)
index 0000000..15b3f86
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "anv_private.h"
+
+struct anv_swapchain;
+
+/* Platform-independent surface vtable.  Each WSI backend (X11, Wayland)
+ * embeds this struct and fills in the callbacks that the anv_wsi.c entry
+ * points dispatch through. */
+struct anv_wsi_surface {
+   struct anv_instance *instance;
+
+   /* Free the surface and any backend state. */
+   void (*destroy)(struct anv_wsi_surface *surface,
+                   const VkAllocationCallbacks *pAllocator);
+   VkResult (*get_support)(struct anv_wsi_surface *surface,
+                           struct anv_physical_device *device,
+                           uint32_t queueFamilyIndex,
+                           VkBool32* pSupported);
+   VkResult (*get_capabilities)(struct anv_wsi_surface *surface,
+                                struct anv_physical_device *device,
+                                VkSurfaceCapabilitiesKHR* pSurfaceCapabilities);
+   VkResult (*get_formats)(struct anv_wsi_surface *surface,
+                           struct anv_physical_device *device,
+                           uint32_t* pSurfaceFormatCount,
+                           VkSurfaceFormatKHR* pSurfaceFormats);
+   VkResult (*get_present_modes)(struct anv_wsi_surface *surface,
+                                 struct anv_physical_device *device,
+                                 uint32_t* pPresentModeCount,
+                                 VkPresentModeKHR* pPresentModes);
+   /* Build a backend-specific swapchain for this surface. */
+   VkResult (*create_swapchain)(struct anv_wsi_surface *surface,
+                                struct anv_device *device,
+                                const VkSwapchainCreateInfoKHR* pCreateInfo,
+                                const VkAllocationCallbacks* pAllocator,
+                                struct anv_swapchain **swapchain);
+};
+
+/* Platform-independent swapchain vtable, embedded by each WSI backend's
+ * concrete swapchain type and dispatched through by anv_wsi.c. */
+struct anv_swapchain {
+   struct anv_device *device;
+
+   VkResult (*destroy)(struct anv_swapchain *swapchain,
+                       const VkAllocationCallbacks *pAllocator);
+   VkResult (*get_images)(struct anv_swapchain *swapchain,
+                          uint32_t *pCount, VkImage *pSwapchainImages);
+   VkResult (*acquire_next_image)(struct anv_swapchain *swap_chain,
+                                  uint64_t timeout, VkSemaphore semaphore,
+                                  uint32_t *image_index);
+   VkResult (*queue_present)(struct anv_swapchain *swap_chain,
+                             struct anv_queue *queue,
+                             uint32_t image_index);
+};
+
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_wsi_surface, VkSurfaceKHR)
+ANV_DEFINE_NONDISP_HANDLE_CASTS(anv_swapchain, VkSwapchainKHR)
+
+VkResult anv_x11_init_wsi(struct anv_instance *instance);
+void anv_x11_finish_wsi(struct anv_instance *instance);
+VkResult anv_wl_init_wsi(struct anv_instance *instance);
+void anv_wl_finish_wsi(struct anv_instance *instance);
diff --git a/src/vulkan/anv_wsi_wayland.c b/src/vulkan/anv_wsi_wayland.c
new file mode 100644 (file)
index 0000000..5e8a3a5
--- /dev/null
@@ -0,0 +1,877 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <wayland-client.h>
+#include <wayland-drm-client-protocol.h>
+
+#include "anv_wsi.h"
+
+#include <util/hash_table.h>
+
+/* Minimum number of swapchain images advertised to applications. */
+#define MIN_NUM_IMAGES 2
+
+/* Wayland implementation of anv_wsi_surface. */
+struct wsi_wl_surface {
+   struct anv_wsi_surface base;
+
+   struct wl_display *display;
+   struct wl_surface *surface;
+};
+
+/* Per-wl_display state: the bound wl_drm global, its advertised formats
+ * (as VkFormats), and its capability flags. */
+struct wsi_wl_display {
+   struct wl_display *                          display;
+   struct wl_drm *                              drm;
+
+   /* Vector of VkFormats supported */
+   struct anv_vector                            formats;
+
+   uint32_t                                     capabilities;
+};
+
+/* Instance-wide Wayland WSI state. */
+struct wsi_wayland {
+   struct anv_instance *                        instance;
+
+   /* Protects 'displays'. */
+   pthread_mutex_t                              mutex;
+   /* Hash table of wl_display -> wsi_wl_display mappings */
+   struct hash_table *                          displays;
+};
+
+/* Record 'format' as presentable on this display, skipping duplicates and
+ * formats the driver cannot render to.  Allocation failure in
+ * anv_vector_add is ignored: the format is simply not advertised. */
+static void
+wsi_wl_display_add_vk_format(struct wsi_wl_display *display, VkFormat format)
+{
+   /* Don't add a format that's already in the list */
+   VkFormat *f;
+   anv_vector_foreach(f, &display->formats)
+      if (*f == format)
+         return;
+
+   /* Don't add formats which aren't supported by the driver */
+   if (anv_format_for_vk_format(format)->surface_format ==
+       ISL_FORMAT_UNSUPPORTED) {
+      return;
+   }
+
+   f = anv_vector_add(&display->formats);
+   if (f)
+      *f = format;
+}
+
+/* wl_drm.device event: we don't open the node here, just log it. */
+static void
+drm_handle_device(void *data, struct wl_drm *drm, const char *name)
+{
+   fprintf(stderr, "wl_drm.device(%s)\n", name);
+}
+
+/* Map a VkFormat to the wl_drm fourcc to use for buffers of that format;
+ * 'alpha' selects the A variant over the X variant where one exists.
+ * Only the B8G8R8(_A8) formats are enabled so far; returns 0 for anything
+ * unsupported. */
+static uint32_t
+wl_drm_format_for_vk_format(VkFormat vk_format, bool alpha)
+{
+   switch (vk_format) {
+   /* TODO: Figure out what all the formats mean and make this table
+    * correct.
+    */
+#if 0
+   case VK_FORMAT_R4G4B4A4_UNORM:
+      return alpha ? WL_DRM_FORMAT_ABGR4444 : WL_DRM_FORMAT_XBGR4444;
+   case VK_FORMAT_R5G6B5_UNORM:
+      return WL_DRM_FORMAT_BGR565;
+   case VK_FORMAT_R5G5B5A1_UNORM:
+      return alpha ? WL_DRM_FORMAT_ABGR1555 : WL_DRM_FORMAT_XBGR1555;
+   case VK_FORMAT_R8G8B8_UNORM:
+      return WL_DRM_FORMAT_XBGR8888;
+   case VK_FORMAT_R8G8B8A8_UNORM:
+      return alpha ? WL_DRM_FORMAT_ABGR8888 : WL_DRM_FORMAT_XBGR8888;
+   case VK_FORMAT_R10G10B10A2_UNORM:
+      return alpha ? WL_DRM_FORMAT_ABGR2101010 : WL_DRM_FORMAT_XBGR2101010;
+   case VK_FORMAT_B4G4R4A4_UNORM:
+      return alpha ? WL_DRM_FORMAT_ARGB4444 : WL_DRM_FORMAT_XRGB4444;
+   case VK_FORMAT_B5G6R5_UNORM:
+      return WL_DRM_FORMAT_RGB565;
+   case VK_FORMAT_B5G5R5A1_UNORM:
+      /* Fixed: the alpha variant previously returned XRGB1555 twice. */
+      return alpha ? WL_DRM_FORMAT_ARGB1555 : WL_DRM_FORMAT_XRGB1555;
+#endif
+   case VK_FORMAT_B8G8R8_UNORM:
+      return WL_DRM_FORMAT_BGRX8888;
+   case VK_FORMAT_B8G8R8A8_UNORM:
+      return alpha ? WL_DRM_FORMAT_ARGB8888 : WL_DRM_FORMAT_XRGB8888;
+#if 0
+   case VK_FORMAT_B10G10R10A2_UNORM:
+      return alpha ? WL_DRM_FORMAT_ARGB2101010 : WL_DRM_FORMAT_XRGB2101010;
+#endif
+
+   default:
+      /* The '!' must be outside the string literal; a bare string literal
+       * is always true, so assert("!...") could never fire. */
+      assert(!"Unsupported Vulkan format");
+      return 0;
+   }
+}
+
+/* wl_drm.format event: translate each advertised wl_drm fourcc into the
+ * corresponding VkFormat(s) for the display.  An X (no alpha) variant also
+ * implies support for its A variant, hence the fallthroughs.  Formats in
+ * the #if 0 regions mirror the disabled table above and are not yet
+ * enabled. */
+static void
+drm_handle_format(void *data, struct wl_drm *drm, uint32_t wl_format)
+{
+   struct wsi_wl_display *display = data;
+
+   switch (wl_format) {
+#if 0
+   case WL_DRM_FORMAT_ABGR4444:
+   case WL_DRM_FORMAT_XBGR4444:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R4G4B4A4_UNORM);
+      break;
+   case WL_DRM_FORMAT_BGR565:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R5G6B5_UNORM);
+      break;
+   case WL_DRM_FORMAT_ABGR1555:
+   case WL_DRM_FORMAT_XBGR1555:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R5G5B5A1_UNORM);
+      break;
+   case WL_DRM_FORMAT_XBGR8888:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R8G8B8_UNORM);
+      /* fallthrough */
+   case WL_DRM_FORMAT_ABGR8888:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R8G8B8A8_UNORM);
+      break;
+   case WL_DRM_FORMAT_ABGR2101010:
+   case WL_DRM_FORMAT_XBGR2101010:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_R10G10B10A2_UNORM);
+      break;
+   case WL_DRM_FORMAT_ARGB4444:
+   case WL_DRM_FORMAT_XRGB4444:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B4G4R4A4_UNORM);
+      break;
+   case WL_DRM_FORMAT_RGB565:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B5G6R5_UNORM);
+      break;
+   case WL_DRM_FORMAT_ARGB1555:
+   case WL_DRM_FORMAT_XRGB1555:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B5G5R5A1_UNORM);
+      break;
+#endif
+   case WL_DRM_FORMAT_XRGB8888:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B8G8R8_UNORM);
+      /* fallthrough */
+   case WL_DRM_FORMAT_ARGB8888:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B8G8R8A8_UNORM);
+      break;
+#if 0
+   case WL_DRM_FORMAT_ARGB2101010:
+   case WL_DRM_FORMAT_XRGB2101010:
+      wsi_wl_display_add_vk_format(display, VK_FORMAT_B10G10R10A2_UNORM);
+      break;
+#endif
+   }
+}
+
+/* wl_drm.authenticated event: nothing to do (we don't authenticate here). */
+static void
+drm_handle_authenticated(void *data, struct wl_drm *drm)
+{
+}
+
+/* wl_drm.capabilities event: record the flags (checked later for PRIME). */
+static void
+drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t capabilities)
+{
+   struct wsi_wl_display *display = data;
+
+   display->capabilities = capabilities;
+}
+
+/* wl_drm event listener.  Designated initializers bind each handler to its
+ * field by name, so the table can't silently break if the protocol struct's
+ * field order ever changes. */
+static const struct wl_drm_listener drm_listener = {
+   .device = drm_handle_device,
+   .format = drm_handle_format,
+   .authenticated = drm_handle_authenticated,
+   .capabilities = drm_handle_capabilities,
+};
+
+/* wl_registry.global event: bind the wl_drm global (version 2 required for
+ * the capabilities event) and attach our listener.  All other globals are
+ * ignored. */
+static void
+registry_handle_global(void *data, struct wl_registry *registry,
+                       uint32_t name, const char *interface, uint32_t version)
+{
+   struct wsi_wl_display *display = data;
+
+   if (strcmp(interface, "wl_drm") == 0) {
+      assert(display->drm == NULL);
+
+      assert(version >= 2);
+      display->drm = wl_registry_bind(registry, name, &wl_drm_interface, 2);
+
+      if (display->drm)
+         wl_drm_add_listener(display->drm, &drm_listener, display);
+   }
+}
+
+/* wl_registry.global_remove event: intentionally ignored. */
+static void
+registry_handle_global_remove(void *data, struct wl_registry *registry,
+                              uint32_t name)
+{ /* No-op */ }
+
+/* wl_registry event listener; designated initializers bind handlers by
+ * field name rather than by position. */
+static const struct wl_registry_listener registry_listener = {
+   .global = registry_handle_global,
+   .global_remove = registry_handle_global_remove
+};
+
+/* Free a wsi_wl_display and its resources.  The wl_display itself is owned
+ * by the application and is not destroyed here. */
+static void
+wsi_wl_display_destroy(struct wsi_wayland *wsi, struct wsi_wl_display *display)
+{
+   anv_vector_finish(&display->formats);
+   if (display->drm)
+      wl_drm_destroy(display->drm);
+   anv_free(&wsi->instance->alloc, display);
+}
+
+/* Create per-wl_display WSI state: bind wl_drm from the registry, collect
+ * its formats and capabilities (two blocking roundtrips), and require PRIME
+ * support.  Returns NULL on any failure, releasing everything allocated so
+ * far. */
+static struct wsi_wl_display *
+wsi_wl_display_create(struct wsi_wayland *wsi, struct wl_display *wl_display)
+{
+   /* Declared (and NULL-initialized) before any 'goto fail' so the cleanup
+    * path never reads an indeterminate pointer; the old code jumped to
+    * 'fail' from before this declaration. */
+   struct wl_registry *registry = NULL;
+
+   struct wsi_wl_display *display =
+      anv_alloc(&wsi->instance->alloc, sizeof(*display), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!display)
+      return NULL;
+
+   memset(display, 0, sizeof(*display));
+
+   display->display = wl_display;
+
+   if (!anv_vector_init(&display->formats, sizeof(VkFormat), 8))
+      goto fail;
+
+   registry = wl_display_get_registry(wl_display);
+   if (!registry)
+      goto fail; /* was 'return NULL', which leaked 'display' */
+
+   wl_registry_add_listener(registry, &registry_listener, display);
+
+   /* Round-trip to get the wl_drm global */
+   wl_display_roundtrip(wl_display);
+
+   if (!display->drm)
+      goto fail;
+
+   /* Round-trip to get wl_drm formats and capabilities */
+   wl_display_roundtrip(wl_display);
+
+   /* We need prime support */
+   if (!(display->capabilities & WL_DRM_CAPABILITY_PRIME))
+      goto fail;
+
+   /* We don't need this anymore */
+   wl_registry_destroy(registry);
+
+   return display;
+
+fail:
+   if (registry)
+      wl_registry_destroy(registry);
+
+   wsi_wl_display_destroy(wsi, display);
+   return NULL;
+}
+
+/* Look up (or lazily create) the wsi_wl_display for a wl_display, caching
+ * it in wsi->displays.  Thread-safe: the mutex is dropped around the
+ * blocking creation roundtrips, and a lost race is resolved by destroying
+ * our copy.  Returns NULL if creation fails (without caching the failure,
+ * which previously inserted a NULL entry or passed NULL to
+ * wsi_wl_display_destroy). */
+static struct wsi_wl_display *
+wsi_wl_get_display(struct anv_instance *instance, struct wl_display *wl_display)
+{
+   struct wsi_wayland *wsi = instance->wayland_wsi;
+
+   pthread_mutex_lock(&wsi->mutex);
+
+   struct hash_entry *entry = _mesa_hash_table_search(wsi->displays,
+                                                      wl_display);
+   if (!entry) {
+      /* We're about to make a bunch of blocking calls.  Let's drop the
+       * mutex for now so we don't block up too badly.
+       */
+      pthread_mutex_unlock(&wsi->mutex);
+
+      struct wsi_wl_display *display = wsi_wl_display_create(wsi, wl_display);
+      if (!display)
+         return NULL; /* mutex is not held here */
+
+      pthread_mutex_lock(&wsi->mutex);
+
+      entry = _mesa_hash_table_search(wsi->displays, wl_display);
+      if (entry) {
+         /* Oops, someone raced us to it */
+         wsi_wl_display_destroy(wsi, display);
+      } else {
+         entry = _mesa_hash_table_insert(wsi->displays, wl_display, display);
+      }
+   }
+
+   pthread_mutex_unlock(&wsi->mutex);
+
+   return entry->data;
+}
+
+/* vkGetPhysicalDeviceWaylandPresentationSupportKHR: presentation is
+ * supported exactly when we can set up a wsi_wl_display for the connection
+ * (which requires the compositor to expose wl_drm with PRIME support).
+ * queueFamilyIndex is not consulted.
+ */
+VkBool32 anv_GetPhysicalDeviceWaylandPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    struct wl_display*                          display)
+{
+   ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+
+   return wsi_wl_get_display(physical_device->instance, display) != NULL;
+}
+
+/* anv_wsi_surface::get_support for Wayland.  Presentation is reported as
+ * supported on every queue family.
+ */
+static VkResult
+wsi_wl_surface_get_support(struct anv_wsi_surface *surface,
+                           struct anv_physical_device *device,
+                           uint32_t queueFamilyIndex,
+                           VkBool32* pSupported)
+{
+   *pSupported = VK_TRUE;
+   return VK_SUCCESS;
+}
+
+/* Present modes advertised for Wayland surfaces.  FIFO is implemented by
+ * throttling on wl_surface frame callbacks (see
+ * wsi_wl_swapchain_queue_present).
+ */
+static const VkPresentModeKHR present_modes[] = {
+   VK_PRESENT_MODE_MAILBOX_KHR,
+   VK_PRESENT_MODE_FIFO_KHR,
+};
+
+/* anv_wsi_surface::get_capabilities for Wayland.  Wayland windows have no
+ * fixed size, so currentExtent is (-1, -1) (the "undefined extent"
+ * sentinel) and the min/max extents are essentially unbounded.
+ */
+static VkResult
+wsi_wl_surface_get_capabilities(struct anv_wsi_surface *surface,
+                                struct anv_physical_device *device,
+                                VkSurfaceCapabilitiesKHR* caps)
+{
+   caps->minImageCount = MIN_NUM_IMAGES;
+   caps->maxImageCount = 4;
+   caps->currentExtent = (VkExtent2D) { -1, -1 };
+   caps->minImageExtent = (VkExtent2D) { 1, 1 };
+   caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+   caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   caps->maxImageArrayLayers = 1;
+
+   caps->supportedCompositeAlpha =
+      VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR |
+      VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR;
+
+   caps->supportedUsageFlags =
+      VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+      VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::get_formats for Wayland, using the standard Vulkan
+ * two-call count/fill idiom.  The format list comes from the wl_drm format
+ * events gathered in wsi_wl_display_create().
+ */
+static VkResult
+wsi_wl_surface_get_formats(struct anv_wsi_surface *wsi_surface,
+                           struct anv_physical_device *device,
+                           uint32_t* pSurfaceFormatCount,
+                           VkSurfaceFormatKHR* pSurfaceFormats)
+{
+   struct wsi_wl_surface *surface = (struct wsi_wl_surface *)wsi_surface;
+   struct wsi_wl_display *display =
+      wsi_wl_get_display(device->instance, surface->display);
+
+   /* wsi_wl_get_display() can fail (allocation failure, or a compositor
+    * without wl_drm/PRIME); previously the NULL was dereferenced below.
+    */
+   if (!display)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   uint32_t count = anv_vector_length(&display->formats);
+
+   if (pSurfaceFormats == NULL) {
+      *pSurfaceFormatCount = count;
+      return VK_SUCCESS;
+   }
+
+   assert(*pSurfaceFormatCount >= count);
+   *pSurfaceFormatCount = count;
+
+   VkFormat *f;
+   anv_vector_foreach(f, &display->formats) {
+      *(pSurfaceFormats++) = (VkSurfaceFormatKHR) {
+         .format = *f,
+         /* TODO: We should get this from the compositor somehow */
+         .colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
+      };
+   }
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::get_present_modes for Wayland (two-call count/fill
+ * idiom over the static present_modes table).
+ */
+static VkResult
+wsi_wl_surface_get_present_modes(struct anv_wsi_surface *surface,
+                                 struct anv_physical_device *device,
+                                 uint32_t* pPresentModeCount,
+                                 VkPresentModeKHR* pPresentModes)
+{
+   if (pPresentModes == NULL) {
+      *pPresentModeCount = ARRAY_SIZE(present_modes);
+      return VK_SUCCESS;
+   }
+
+   assert(*pPresentModeCount >= ARRAY_SIZE(present_modes));
+   /* Copy only as many elements as present_modes actually holds; copying
+    * *pPresentModeCount elements (as before) read past the end of the
+    * array whenever the caller's buffer was larger.
+    */
+   typed_memcpy(pPresentModes, present_modes, ARRAY_SIZE(present_modes));
+   *pPresentModeCount = ARRAY_SIZE(present_modes);
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::destroy for Wayland surfaces.  Frees only our wrapper
+ * struct, through the same allocator pair used at creation.
+ */
+static void
+wsi_wl_surface_destroy(struct anv_wsi_surface *surface,
+                       const VkAllocationCallbacks *pAllocator)
+{
+   anv_free2(&surface->instance->alloc, pAllocator, surface);
+}
+
+static VkResult
+wsi_wl_surface_create_swapchain(struct anv_wsi_surface *surface,
+                                struct anv_device *device,
+                                const VkSwapchainCreateInfoKHR* pCreateInfo,
+                                const VkAllocationCallbacks* pAllocator,
+                                struct anv_swapchain **swapchain);
+
+/* vkCreateWaylandSurfaceKHR: allocate a wsi_wl_surface wrapper around the
+ * application's (wl_display, wl_surface) pair and wire up the Wayland
+ * back-end vtable.  create_swapchain is forward-declared above and defined
+ * later in this file.
+ */
+VkResult anv_CreateWaylandSurfaceKHR(
+    VkInstance                                  _instance,
+    const VkWaylandSurfaceCreateInfoKHR*        pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface)
+{
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR);
+
+   ANV_FROM_HANDLE(anv_instance, instance, _instance);
+   struct wsi_wl_surface *surface;
+
+   surface = anv_alloc2(&instance->alloc, pAllocator, sizeof *surface, 8,
+                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (surface == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   surface->display = pCreateInfo->display;
+   surface->surface = pCreateInfo->surface;
+
+   surface->base.instance = instance;
+   surface->base.destroy = wsi_wl_surface_destroy;
+   surface->base.get_support = wsi_wl_surface_get_support;
+   surface->base.get_capabilities = wsi_wl_surface_get_capabilities;
+   surface->base.get_formats = wsi_wl_surface_get_formats;
+   surface->base.get_present_modes = wsi_wl_surface_get_present_modes;
+   surface->base.create_swapchain = wsi_wl_surface_create_swapchain;
+
+   *pSurface = anv_wsi_surface_to_handle(&surface->base);
+
+   return VK_SUCCESS;
+}
+
+/* One swapchain image: the anv image/memory backing it, the wl_buffer that
+ * wraps the same BO via wl_drm PRIME, and a busy flag that is set at
+ * present time and cleared by the wl_buffer release event.
+ */
+struct wsi_wl_image {
+   struct anv_image *                           image;
+   struct anv_device_memory *                   memory;
+   struct wl_buffer *                           buffer;
+   bool                                         busy;
+};
+
+/* Wayland swapchain.  All events for this swapchain (buffer releases,
+ * frame callbacks) are dispatched on the private wl_event_queue `queue`.
+ * `images` is a trailing variable-length array of image_count entries,
+ * allocated together with the struct.
+ */
+struct wsi_wl_swapchain {
+   struct anv_swapchain                        base;
+
+   struct wsi_wl_display *                      display;
+   struct wl_event_queue *                      queue;
+   struct wl_surface *                          surface;
+
+   VkExtent2D                                   extent;
+   VkFormat                                     vk_format;
+   uint32_t                                     drm_format;
+
+   VkPresentModeKHR                             present_mode;
+   /* FIFO mode only: set by the frame callback, cleared on present. */
+   bool                                         fifo_ready;
+
+   uint32_t                                     image_count;
+   struct wsi_wl_image                          images[0];
+};
+
+/* anv_swapchain::get_images for Wayland: the standard Vulkan two-call
+ * count/fill protocol over the swapchain's image array.
+ */
+static VkResult
+wsi_wl_swapchain_get_images(struct anv_swapchain *anv_chain,
+                            uint32_t *pCount, VkImage *pSwapchainImages)
+{
+   struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)anv_chain;
+   uint32_t n = chain->image_count;
+
+   if (pSwapchainImages != NULL) {
+      assert(n <= *pCount);
+      for (uint32_t idx = 0; idx < n; idx++)
+         pSwapchainImages[idx] = anv_image_to_handle(chain->images[idx].image);
+   }
+
+   *pCount = n;
+
+   return VK_SUCCESS;
+}
+
+/* anv_swapchain::acquire_next_image for Wayland: first drain any pending
+ * events on the swapchain's private queue, then block (dispatching) until
+ * a wl_buffer release event marks some image non-busy.
+ *
+ * NOTE(review): the `timeout` and `semaphore` parameters are never
+ * consulted, so an acquire can block indefinitely regardless of the
+ * requested timeout — confirm whether callers rely on this.
+ */
+static VkResult
+wsi_wl_swapchain_acquire_next_image(struct anv_swapchain *anv_chain,
+                                    uint64_t timeout,
+                                    VkSemaphore semaphore,
+                                    uint32_t *image_index)
+{
+   struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)anv_chain;
+
+   int ret = wl_display_dispatch_queue_pending(chain->display->display,
+                                               chain->queue);
+   /* XXX: I'm not sure if out-of-date is the right error here.  If
+    * wl_display_dispatch_queue_pending fails it most likely means we got
+    * kicked by the server so this seems more-or-less correct.
+    */
+   if (ret < 0)
+      return vk_error(VK_ERROR_OUT_OF_DATE_KHR);
+
+   while (1) {
+      for (uint32_t i = 0; i < chain->image_count; i++) {
+         if (!chain->images[i].busy) {
+            /* We found a non-busy image */
+            *image_index = i;
+            return VK_SUCCESS;
+         }
+      }
+
+      /* This time we do a blocking dispatch because we can't go
+       * anywhere until we get an event.
+       */
+      int ret = wl_display_dispatch_queue(chain->display->display,
+                                          chain->queue);
+      if (ret < 0)
+         return vk_error(VK_ERROR_OUT_OF_DATE_KHR);
+   }
+}
+
+/* wl_callback "done" handler for the wl_surface.frame request issued in
+ * queue_present: the compositor is ready for a new frame, so un-throttle
+ * FIFO presentation.  The callback is one-shot and destroyed here.
+ */
+static void
+frame_handle_done(void *data, struct wl_callback *callback, uint32_t serial)
+{
+   struct wsi_wl_swapchain *chain = data;
+
+   chain->fifo_ready = true;
+
+   wl_callback_destroy(callback);
+}
+
+static const struct wl_callback_listener frame_listener = {
+   frame_handle_done,
+};
+
+/* anv_swapchain::queue_present for Wayland.
+ *
+ * In FIFO mode, first block until the previous frame's callback has fired
+ * (fifo_ready), then attach/damage/commit the buffer and register a new
+ * frame callback on the swapchain's private queue.  The attach -> damage ->
+ * commit ordering follows the wl_surface protocol.
+ */
+static VkResult
+wsi_wl_swapchain_queue_present(struct anv_swapchain *anv_chain,
+                               struct anv_queue *queue,
+                               uint32_t image_index)
+{
+   struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)anv_chain;
+
+   if (chain->present_mode == VK_PRESENT_MODE_FIFO_KHR) {
+      while (!chain->fifo_ready) {
+         int ret = wl_display_dispatch_queue(chain->display->display,
+                                             chain->queue);
+         if (ret < 0)
+            return vk_error(VK_ERROR_OUT_OF_DATE_KHR);
+      }
+   }
+
+   assert(image_index < chain->image_count);
+   wl_surface_attach(chain->surface, chain->images[image_index].buffer, 0, 0);
+   /* Damage the whole surface; the compositor clips to the surface size. */
+   wl_surface_damage(chain->surface, 0, 0, INT32_MAX, INT32_MAX);
+
+   if (chain->present_mode == VK_PRESENT_MODE_FIFO_KHR) {
+      struct wl_callback *frame = wl_surface_frame(chain->surface);
+      wl_proxy_set_queue((struct wl_proxy *)frame, chain->queue);
+      wl_callback_add_listener(frame, &frame_listener, chain);
+      chain->fifo_ready = false;
+   }
+
+   wl_surface_commit(chain->surface);
+   wl_display_flush(chain->display->display);
+
+   return VK_SUCCESS;
+}
+
+/* Release the Vulkan objects backing one swapchain image (memory first,
+ * then the image), undoing wsi_wl_image_init().
+ */
+static void
+wsi_wl_image_finish(struct wsi_wl_swapchain *chain, struct wsi_wl_image *image,
+                    const VkAllocationCallbacks* pAllocator)
+{
+   VkDevice vk_device = anv_device_to_handle(chain->base.device);
+   anv_FreeMemory(vk_device, anv_device_memory_to_handle(image->memory),
+                  pAllocator);
+   anv_DestroyImage(vk_device, anv_image_to_handle(image->image),
+                    pAllocator);
+}
+
+/* wl_buffer release handler: the compositor is done reading from this
+ * buffer, so the image can be re-acquired (see acquire_next_image).
+ */
+static void
+buffer_handle_release(void *data, struct wl_buffer *buffer)
+{
+   struct wsi_wl_image *image = data;
+
+   assert(image->buffer == buffer);
+
+   image->busy = false;
+}
+
+static const struct wl_buffer_listener buffer_listener = {
+   buffer_handle_release,
+};
+
+/* Create one swapchain image: an X-tiled anv image plus backing memory,
+ * exported via PRIME (dma-buf fd) and wrapped in a wl_buffer whose events
+ * are routed to the swapchain's private queue.
+ *
+ * On success, fills image->image / image->memory / image->buffer.  On
+ * failure, everything created so far is released before returning.
+ */
+static VkResult
+wsi_wl_image_init(struct wsi_wl_swapchain *chain, struct wsi_wl_image *image,
+                  const VkAllocationCallbacks* pAllocator)
+{
+   VkDevice vk_device = anv_device_to_handle(chain->base.device);
+   VkResult result;
+
+   VkImage vk_image;
+   result = anv_image_create(vk_device,
+      &(struct anv_image_create_info) {
+         /* Force X-tiling so the BO tiling set below matches the surface
+          * layout.
+          */
+         .isl_tiling_flags = ISL_TILING_X_BIT,
+         .stride = 0,
+         .vk_info =
+      &(VkImageCreateInfo) {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+         .imageType = VK_IMAGE_TYPE_2D,
+         .format = chain->vk_format,
+         .extent = {
+            .width = chain->extent.width,
+            .height = chain->extent.height,
+            .depth = 1
+         },
+         .mipLevels = 1,
+         .arrayLayers = 1,
+         .samples = 1,
+         /* FIXME: Need a way to use X tiling to allow scanout */
+         .tiling = VK_IMAGE_TILING_OPTIMAL,
+         .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+         .flags = 0,
+      }},
+      pAllocator,
+      &vk_image);
+
+   if (result != VK_SUCCESS)
+      return result;
+
+   image->image = anv_image_from_handle(vk_image);
+   assert(anv_format_is_color(image->image->format));
+
+   struct anv_surface *surface = &image->image->color_surface;
+
+   VkDeviceMemory vk_memory;
+   result = anv_AllocateMemory(vk_device,
+      &(VkMemoryAllocateInfo) {
+         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+         .allocationSize = image->image->size,
+         .memoryTypeIndex = 0,
+      },
+      pAllocator,
+      &vk_memory);
+
+   if (result != VK_SUCCESS)
+      goto fail_image;
+
+   image->memory = anv_device_memory_from_handle(vk_memory);
+
+   result = anv_BindImageMemory(vk_device, vk_image, vk_memory, 0);
+
+   if (result != VK_SUCCESS)
+      goto fail_mem;
+
+   /* The kernel must know the BO is X-tiled so the compositor can map it
+    * consistently with the surface's row pitch.
+    */
+   int ret = anv_gem_set_tiling(chain->base.device,
+                                image->memory->bo.gem_handle,
+                                surface->isl.row_pitch, I915_TILING_X);
+   if (ret) {
+      /* FINISHME: Choose a better error. */
+      result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      goto fail_mem;
+   }
+
+   int fd = anv_gem_handle_to_fd(chain->base.device,
+                                 image->memory->bo.gem_handle);
+   if (fd == -1) {
+      /* FINISHME: Choose a better error. */
+      result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      goto fail_mem;
+   }
+
+   image->buffer = wl_drm_create_prime_buffer(chain->display->drm,
+                                              fd, /* name */
+                                              chain->extent.width,
+                                              chain->extent.height,
+                                              chain->drm_format,
+                                              surface->offset,
+                                              surface->isl.row_pitch,
+                                              0, 0, 0, 0 /* unused */);
+   /* Round-trip so the server has imported the fd before we close it. */
+   wl_display_roundtrip(chain->display->display);
+   close(fd);
+
+   wl_proxy_set_queue((struct wl_proxy *)image->buffer, chain->queue);
+   wl_buffer_add_listener(image->buffer, &buffer_listener, image);
+
+   return VK_SUCCESS;
+
+fail_mem:
+   anv_FreeMemory(vk_device, vk_memory, pAllocator);
+fail_image:
+   anv_DestroyImage(vk_device, vk_image, pAllocator);
+
+   return result;
+}
+
+/* anv_swapchain::destroy for Wayland.  Safe to call on a partially
+ * constructed chain: create_swapchain NULLs each images[i].buffer first,
+ * and only fully initialized images are finished here.
+ */
+static VkResult
+wsi_wl_swapchain_destroy(struct anv_swapchain *anv_chain,
+                         const VkAllocationCallbacks *pAllocator)
+{
+   struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)anv_chain;
+
+   for (uint32_t i = 0; i < chain->image_count; i++) {
+      if (chain->images[i].buffer)
+         wsi_wl_image_finish(chain, &chain->images[i], pAllocator);
+   }
+
+   anv_free2(&chain->base.device->alloc, pAllocator, chain);
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::create_swapchain for Wayland.
+ *
+ * Allocates the chain plus its image array in one block, creates a private
+ * event queue, and initializes image_count buffers.  On failure, all
+ * partial state is released via wsi_wl_swapchain_destroy().
+ */
+static VkResult
+wsi_wl_surface_create_swapchain(struct anv_wsi_surface *wsi_surface,
+                                struct anv_device *device,
+                                const VkSwapchainCreateInfoKHR* pCreateInfo,
+                                const VkAllocationCallbacks* pAllocator,
+                                struct anv_swapchain **swapchain_out)
+{
+   struct wsi_wl_surface *surface = (struct wsi_wl_surface *)wsi_surface;
+   struct wsi_wl_swapchain *chain;
+   VkResult result;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR);
+
+   int num_images = pCreateInfo->minImageCount;
+
+   assert(num_images >= MIN_NUM_IMAGES);
+
+   /* For true mailbox mode, we need at least 4 images:
+    *  1) One to scan out from
+    *  2) One to have queued for scan-out
+    *  3) One to be currently held by the Wayland compositor
+    *  4) One to render to
+    */
+   if (pCreateInfo->presentMode == VK_PRESENT_MODE_MAILBOX_KHR)
+      num_images = MAX2(num_images, 4);
+
+   size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]);
+   chain = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (chain == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   chain->base.device = device;
+   chain->base.destroy = wsi_wl_swapchain_destroy;
+   chain->base.get_images = wsi_wl_swapchain_get_images;
+   chain->base.acquire_next_image = wsi_wl_swapchain_acquire_next_image;
+   chain->base.queue_present = wsi_wl_swapchain_queue_present;
+
+   chain->surface = surface->surface;
+   chain->extent = pCreateInfo->imageExtent;
+   chain->vk_format = pCreateInfo->imageFormat;
+   chain->drm_format = wl_drm_format_for_vk_format(chain->vk_format, false);
+
+   chain->present_mode = pCreateInfo->presentMode;
+   chain->fifo_ready = true;
+
+   chain->image_count = num_images;
+
+   /* Mark a bunch of stuff as NULL.  This way we can just call
+    * destroy_swapchain for cleanup.
+    */
+   for (uint32_t i = 0; i < chain->image_count; i++)
+      chain->images[i].buffer = NULL;
+   chain->queue = NULL;
+
+   chain->display = wsi_wl_get_display(device->instance, surface->display);
+   if (!chain->display) {
+      /* `result` was previously left uninitialized on this goto path. */
+      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail;
+   }
+
+   chain->queue = wl_display_create_queue(chain->display->display);
+   if (!chain->queue) {
+      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail;
+   }
+
+   for (uint32_t i = 0; i < chain->image_count; i++) {
+      result = wsi_wl_image_init(chain, &chain->images[i], pAllocator);
+      if (result != VK_SUCCESS)
+         goto fail;
+      chain->images[i].busy = false;
+   }
+
+   *swapchain_out = &chain->base;
+
+   return VK_SUCCESS;
+
+fail:
+   wsi_wl_swapchain_destroy(&chain->base, pAllocator);
+
+   return result;
+}
+
+/* Create the per-instance Wayland WSI state: the mutex and the
+ * wl_display -> wsi_wl_display cache used by wsi_wl_get_display().
+ * On any failure, instance->wayland_wsi is left NULL so
+ * anv_wl_finish_wsi() is safe to call.
+ */
+VkResult
+anv_wl_init_wsi(struct anv_instance *instance)
+{
+   struct wsi_wayland *wsi;
+   VkResult result;
+
+   wsi = anv_alloc(&instance->alloc, sizeof(*wsi), 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!wsi) {
+      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail;
+   }
+
+   wsi->instance = instance;
+
+   int ret = pthread_mutex_init(&wsi->mutex, NULL);
+   if (ret != 0) {
+      /* Both branches currently map to the same error; kept separate for
+       * the FINISHME below.
+       */
+      if (ret == ENOMEM) {
+         result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      } else {
+         /* FINISHME: Choose a better error. */
+         result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+
+      goto fail_alloc;
+   }
+
+   wsi->displays = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                           _mesa_key_pointer_equal);
+   if (!wsi->displays) {
+      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_mutex;
+   }
+
+   instance->wayland_wsi = wsi;
+
+   return VK_SUCCESS;
+
+fail_mutex:
+   pthread_mutex_destroy(&wsi->mutex);
+
+fail_alloc:
+   anv_free(&instance->alloc, wsi);
+fail:
+   instance->wayland_wsi = NULL;
+
+   return result;
+}
+
+/* Destroy the per-instance Wayland WSI state created by anv_wl_init_wsi().
+ * A no-op when initialization failed (wayland_wsi == NULL).
+ */
+void
+anv_wl_finish_wsi(struct anv_instance *instance)
+{
+   struct wsi_wayland *wsi = instance->wayland_wsi;
+
+   if (wsi == NULL)
+      return;
+
+   _mesa_hash_table_destroy(wsi->displays, NULL);
+   pthread_mutex_destroy(&wsi->mutex);
+   anv_free(&instance->alloc, wsi);
+}
diff --git a/src/vulkan/anv_wsi_x11.c b/src/vulkan/anv_wsi_x11.c
new file mode 100644 (file)
index 0000000..8e35191
--- /dev/null
@@ -0,0 +1,459 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xcb/xcb.h>
+#include <xcb/dri3.h>
+#include <xcb/present.h>
+
+#include "anv_wsi.h"
+
+/* X11 (xcb) surface: the application's connection/window pair. */
+struct x11_surface {
+   struct anv_wsi_surface base;
+
+   xcb_connection_t *connection;
+   xcb_window_t window;
+};
+
+/* Single advertised surface format.  NOTE(review): colorSpace is left
+ * zero-initialized — presumably SRGB_NONLINEAR; confirm against the
+ * VkColorSpaceKHR enum values in use.
+ */
+static const VkSurfaceFormatKHR formats[] = {
+   { .format = VK_FORMAT_B8G8R8A8_UNORM, },
+};
+
+static const VkPresentModeKHR present_modes[] = {
+   VK_PRESENT_MODE_MAILBOX_KHR,
+};
+
+/* vkGetPhysicalDeviceXcbPresentationSupportKHR: currently a stub that
+ * always reports true without verifying that the server actually supports
+ * DRI3 (see the finishme).
+ */
+VkBool32 anv_GetPhysicalDeviceXcbPresentationSupportKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    xcb_connection_t*                           connection,
+    xcb_visualid_t                              visual_id)
+{
+   anv_finishme("Check that we actually have DRI3");
+   stub_return(true);
+}
+
+/* anv_wsi_surface::get_capabilities for X11.  Queries the window geometry
+ * so currentExtent matches the actual window; falls back to the "unknown
+ * size" sentinel (-1, -1) when the query fails.
+ */
+static VkResult
+x11_surface_get_capabilities(struct anv_wsi_surface *wsi_surface,
+                             struct anv_physical_device *device,
+                             VkSurfaceCapabilitiesKHR *caps)
+{
+   struct x11_surface *surface = (struct x11_surface *)wsi_surface;
+
+   xcb_get_geometry_cookie_t cookie = xcb_get_geometry(surface->connection,
+                                                       surface->window);
+   xcb_generic_error_t *err;
+   xcb_get_geometry_reply_t *geom = xcb_get_geometry_reply(surface->connection,
+                                                           cookie, &err);
+   if (geom) {
+      VkExtent2D extent = { geom->width, geom->height };
+      caps->currentExtent = extent;
+      caps->minImageExtent = extent;
+      caps->maxImageExtent = extent;
+   } else {
+      /* This can happen if the client didn't wait for the configure event
+       * to come back from the compositor.  In that case, we don't know the
+       * size of the window so we just return valid "I don't know" stuff.
+       */
+      caps->currentExtent = (VkExtent2D) { -1, -1 };
+      caps->minImageExtent = (VkExtent2D) { 1, 1 };
+      caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+   }
+   /* Exactly one of err/geom is non-NULL; free(NULL) is a no-op. */
+   free(err);
+   free(geom);
+
+   caps->minImageCount = 2;
+   caps->maxImageCount = 4;
+   caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   caps->maxImageArrayLayers = 1;
+   caps->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR;
+   caps->supportedUsageFlags =
+      VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+      VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::get_formats for X11 (two-call count/fill idiom over the
+ * static formats table).
+ */
+static VkResult
+x11_surface_get_formats(struct anv_wsi_surface *surface,
+                        struct anv_physical_device *device,
+                        uint32_t *pSurfaceFormatCount,
+                        VkSurfaceFormatKHR *pSurfaceFormats)
+{
+   if (pSurfaceFormats == NULL) {
+      *pSurfaceFormatCount = ARRAY_SIZE(formats);
+      return VK_SUCCESS;
+   }
+
+   assert(*pSurfaceFormatCount >= ARRAY_SIZE(formats));
+   /* Copy only as many elements as `formats` actually holds; copying
+    * *pSurfaceFormatCount elements (as before) read past the end of the
+    * array whenever the caller's buffer was larger.
+    */
+   typed_memcpy(pSurfaceFormats, formats, ARRAY_SIZE(formats));
+   *pSurfaceFormatCount = ARRAY_SIZE(formats);
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::get_present_modes for X11 (two-call count/fill idiom
+ * over the static present_modes table).
+ */
+static VkResult
+x11_surface_get_present_modes(struct anv_wsi_surface *surface,
+                              struct anv_physical_device *device,
+                              uint32_t *pPresentModeCount,
+                              VkPresentModeKHR *pPresentModes)
+{
+   if (pPresentModes == NULL) {
+      *pPresentModeCount = ARRAY_SIZE(present_modes);
+      return VK_SUCCESS;
+   }
+
+   assert(*pPresentModeCount >= ARRAY_SIZE(present_modes));
+   /* Copy only as many elements as present_modes actually holds; copying
+    * *pPresentModeCount elements (as before) read past the end of the
+    * array whenever the caller's buffer was larger.
+    */
+   typed_memcpy(pPresentModes, present_modes, ARRAY_SIZE(present_modes));
+   *pPresentModeCount = ARRAY_SIZE(present_modes);
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::destroy for X11 surfaces.  Frees only our wrapper
+ * struct, through the same allocator pair used at creation.
+ */
+static void
+x11_surface_destroy(struct anv_wsi_surface *surface,
+                    const VkAllocationCallbacks *pAllocator)
+{
+   anv_free2(&surface->instance->alloc, pAllocator, surface);
+}
+
+static VkResult
+x11_surface_create_swapchain(struct anv_wsi_surface *surface,
+                             struct anv_device *device,
+                             const VkSwapchainCreateInfoKHR* pCreateInfo,
+                             const VkAllocationCallbacks* pAllocator,
+                             struct anv_swapchain **swapchain);
+
+/* anv_wsi_surface::get_support for X11; mirrors the Wayland back-end's
+ * wsi_wl_surface_get_support.  Added so base.get_support is no longer left
+ * uninitialized below (anv_alloc2 does not zero the allocation, so callers
+ * invoking the vtable entry would jump through a garbage pointer).
+ */
+static VkResult
+x11_surface_get_support(struct anv_wsi_surface *surface,
+                        struct anv_physical_device *device,
+                        uint32_t queueFamilyIndex,
+                        VkBool32* pSupported)
+{
+   *pSupported = true;
+
+   return VK_SUCCESS;
+}
+
+/* vkCreateXcbSurfaceKHR: allocate an x11_surface wrapper around the
+ * application's (connection, window) pair and wire up the X11 back-end
+ * vtable.  create_swapchain is forward-declared above and defined later in
+ * this file.
+ */
+VkResult anv_CreateXcbSurfaceKHR(
+    VkInstance                                  _instance,
+    const VkXcbSurfaceCreateInfoKHR*            pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSurfaceKHR*                               pSurface)
+{
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR);
+
+   ANV_FROM_HANDLE(anv_instance, instance, _instance);
+   struct x11_surface *surface;
+
+   surface = anv_alloc2(&instance->alloc, pAllocator, sizeof *surface, 8,
+                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (surface == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   surface->connection = pCreateInfo->connection;
+   surface->window = pCreateInfo->window;
+
+   surface->base.instance = instance;
+   surface->base.destroy = x11_surface_destroy;
+   surface->base.get_support = x11_surface_get_support;
+   surface->base.get_capabilities = x11_surface_get_capabilities;
+   surface->base.get_formats = x11_surface_get_formats;
+   surface->base.get_present_modes = x11_surface_get_present_modes;
+   surface->base.create_swapchain = x11_surface_create_swapchain;
+
+   *pSurface = anv_wsi_surface_to_handle(&surface->base);
+
+   return VK_SUCCESS;
+}
+
+/* One swapchain image: the anv image/memory backing plus the DRI3 pixmap
+ * wrapping the same BO.  While busy, geom_cookie holds an outstanding
+ * xcb_get_geometry request whose reply is used as a completion fence (see
+ * x11_acquire_next_image / x11_queue_present).
+ */
+struct x11_image {
+   struct anv_image *                        image;
+   struct anv_device_memory *                memory;
+   xcb_pixmap_t                              pixmap;
+   xcb_get_geometry_cookie_t                 geom_cookie;
+   bool                                      busy;
+};
+
+/* X11 swapchain.  Images are handed out round-robin via next_image;
+ * `images` is a trailing variable-length array of image_count entries,
+ * allocated together with the struct.
+ */
+struct x11_swapchain {
+   struct anv_swapchain                        base;
+
+   xcb_connection_t *                           conn;
+   xcb_window_t                                 window;
+   xcb_gc_t                                     gc;
+   VkExtent2D                                   extent;
+   uint32_t                                     image_count;
+   uint32_t                                     next_image;
+   struct x11_image                             images[0];
+};
+
+/* anv_swapchain::get_images for X11: the standard Vulkan two-call
+ * count/fill protocol over the swapchain's image array.
+ */
+static VkResult
+x11_get_images(struct anv_swapchain *anv_chain,
+               uint32_t* pCount, VkImage *pSwapchainImages)
+{
+   struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain;
+   uint32_t n = chain->image_count;
+
+   if (pSwapchainImages != NULL) {
+      assert(n <= *pCount);
+      for (uint32_t idx = 0; idx < n; idx++)
+         pSwapchainImages[idx] = anv_image_to_handle(chain->images[idx].image);
+   }
+
+   *pCount = n;
+
+   return VK_SUCCESS;
+}
+
+/* anv_swapchain::acquire_next_image for X11: images are handed out
+ * round-robin.  If the candidate is busy, block on the geometry reply
+ * issued at present time — it doubles as a completion fence and as a
+ * resize check (a size change means the swapchain is out of date).
+ *
+ * NOTE(review): the `timeout` and `semaphore` parameters are never
+ * consulted — confirm whether callers rely on the timeout.
+ */
+static VkResult
+x11_acquire_next_image(struct anv_swapchain *anv_chain,
+                       uint64_t timeout,
+                       VkSemaphore semaphore,
+                       uint32_t *image_index)
+{
+   struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain;
+   struct x11_image *image = &chain->images[chain->next_image];
+
+   if (image->busy) {
+      xcb_generic_error_t *err;
+      xcb_get_geometry_reply_t *geom =
+         xcb_get_geometry_reply(chain->conn, image->geom_cookie, &err);
+      if (!geom) {
+         free(err);
+         return vk_error(VK_ERROR_OUT_OF_DATE_KHR);
+      }
+
+      if (geom->width != chain->extent.width ||
+          geom->height != chain->extent.height) {
+         free(geom);
+         return vk_error(VK_ERROR_OUT_OF_DATE_KHR);
+      }
+      free(geom);
+
+      image->busy = false;
+   }
+
+   *image_index = chain->next_image;
+   chain->next_image = (chain->next_image + 1) % chain->image_count;
+   return VK_SUCCESS;
+}
+
+/* anv_swapchain::queue_present for X11: blit the image's pixmap onto the
+ * window with xcb_copy_area, then issue an xcb_get_geometry request whose
+ * reply will later serve as the "copy finished / image reusable" fence in
+ * x11_acquire_next_image.
+ */
+static VkResult
+x11_queue_present(struct anv_swapchain *anv_chain,
+                  struct anv_queue *queue,
+                  uint32_t image_index)
+{
+   struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain;
+   struct x11_image *image = &chain->images[image_index];
+
+   assert(image_index < chain->image_count);
+
+   xcb_void_cookie_t cookie;
+
+   cookie = xcb_copy_area(chain->conn,
+                          image->pixmap,
+                          chain->window,
+                          chain->gc,
+                          0, 0,
+                          0, 0,
+                          chain->extent.width,
+                          chain->extent.height);
+   xcb_discard_reply(chain->conn, cookie.sequence);
+
+   image->geom_cookie = xcb_get_geometry(chain->conn, chain->window);
+   image->busy = true;
+
+   xcb_flush(chain->conn);
+
+   return VK_SUCCESS;
+}
+
+/* anv_swapchain::destroy for X11.  Frees the pixmaps and discards any
+ * outstanding geometry replies.  The anv images and their memory are still
+ * leaked (see the TODO below).
+ */
+static VkResult
+x11_swapchain_destroy(struct anv_swapchain *anv_chain,
+                      const VkAllocationCallbacks *pAllocator)
+{
+   struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain;
+   xcb_void_cookie_t cookie;
+
+   for (uint32_t i = 0; i < chain->image_count; i++) {
+      struct x11_image *image = &chain->images[i];
+
+      if (image->busy)
+         xcb_discard_reply(chain->conn, image->geom_cookie.sequence);
+
+      cookie = xcb_free_pixmap(chain->conn, image->pixmap);
+      xcb_discard_reply(chain->conn, cookie.sequence);
+
+      /* TODO: Delete images and free memory */
+   }
+
+   /* Free through the same allocator pair used by anv_alloc2() in
+    * x11_surface_create_swapchain; the previous anv_free(NULL, chain)
+    * bypassed the caller-provided allocation callbacks.
+    */
+   anv_free2(&chain->base.device->alloc, pAllocator, chain);
+
+   return VK_SUCCESS;
+}
+
+/* anv_wsi_surface::create_swapchain for X11.
+ *
+ * For each image: create an X-tiled anv image + memory, export it as a
+ * PRIME fd, and hand the fd to the server via DRI3 pixmap_from_buffer
+ * (which takes ownership of the fd).  Finally create the GC used by
+ * xcb_copy_area at present time.
+ *
+ * NOTE(review): the anv_image_create / anv_AllocateMemory /
+ * anv_BindImageMemory results are not checked — a failure there would be
+ * silently carried into the chain.
+ */
+static VkResult
+x11_surface_create_swapchain(struct anv_wsi_surface *wsi_surface,
+                             struct anv_device *device,
+                             const VkSwapchainCreateInfoKHR *pCreateInfo,
+                             const VkAllocationCallbacks* pAllocator,
+                             struct anv_swapchain **swapchain_out)
+{
+   struct x11_surface *surface = (struct x11_surface *)wsi_surface;
+   struct x11_swapchain *chain;
+   xcb_void_cookie_t cookie;
+   VkResult result;
+
+   int num_images = pCreateInfo->minImageCount;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR);
+
+   size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]);
+   chain = anv_alloc2(&device->alloc, pAllocator, size, 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (chain == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   chain->base.device = device;
+   chain->base.destroy = x11_swapchain_destroy;
+   chain->base.get_images = x11_get_images;
+   chain->base.acquire_next_image = x11_acquire_next_image;
+   chain->base.queue_present = x11_queue_present;
+
+   chain->conn = surface->connection;
+   chain->window = surface->window;
+   chain->extent = pCreateInfo->imageExtent;
+   chain->image_count = num_images;
+   chain->next_image = 0;
+
+   for (uint32_t i = 0; i < chain->image_count; i++) {
+      VkDeviceMemory memory_h;
+      VkImage image_h;
+      struct anv_image *image;
+      struct anv_surface *surface;
+      struct anv_device_memory *memory;
+
+      anv_image_create(anv_device_to_handle(device),
+         &(struct anv_image_create_info) {
+            .isl_tiling_flags = ISL_TILING_X_BIT,
+            .stride = 0,
+            .vk_info =
+         &(VkImageCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .imageType = VK_IMAGE_TYPE_2D,
+            .format = pCreateInfo->imageFormat,
+            .extent = {
+               .width = pCreateInfo->imageExtent.width,
+               .height = pCreateInfo->imageExtent.height,
+               .depth = 1
+            },
+            .mipLevels = 1,
+            .arrayLayers = 1,
+            .samples = 1,
+            /* FIXME: Need a way to use X tiling to allow scanout */
+            .tiling = VK_IMAGE_TILING_OPTIMAL,
+            .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+            .flags = 0,
+         }},
+         NULL,
+         &image_h);
+
+      image = anv_image_from_handle(image_h);
+      assert(anv_format_is_color(image->format));
+
+      surface = &image->color_surface;
+
+      anv_AllocateMemory(anv_device_to_handle(device),
+         &(VkMemoryAllocateInfo) {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+            .allocationSize = image->size,
+            .memoryTypeIndex = 0,
+         },
+         NULL /* XXX: pAllocator */,
+         &memory_h);
+
+      memory = anv_device_memory_from_handle(memory_h);
+
+      anv_BindImageMemory(VK_NULL_HANDLE, anv_image_to_handle(image),
+                          memory_h, 0);
+
+      int ret = anv_gem_set_tiling(device, memory->bo.gem_handle,
+                                   surface->isl.row_pitch, I915_TILING_X);
+      if (ret) {
+         /* FINISHME: Choose a better error. */
+         result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                            "set_tiling failed: %m");
+         goto fail;
+      }
+
+      int fd = anv_gem_handle_to_fd(device, memory->bo.gem_handle);
+      if (fd == -1) {
+         /* FINISHME: Choose a better error. */
+         result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                            "handle_to_fd failed: %m");
+         goto fail;
+      }
+
+      uint32_t bpp = 32;
+      uint32_t depth = 24;
+      xcb_pixmap_t pixmap = xcb_generate_id(chain->conn);
+
+      /* The server takes ownership of fd. */
+      cookie =
+         xcb_dri3_pixmap_from_buffer_checked(chain->conn,
+                                             pixmap,
+                                             chain->window,
+                                             image->size,
+                                             pCreateInfo->imageExtent.width,
+                                             pCreateInfo->imageExtent.height,
+                                             surface->isl.row_pitch,
+                                             depth, bpp, fd);
+
+      chain->images[i].image = image;
+      chain->images[i].memory = memory;
+      chain->images[i].pixmap = pixmap;
+      chain->images[i].busy = false;
+
+      xcb_discard_reply(chain->conn, cookie.sequence);
+   }
+
+   chain->gc = xcb_generate_id(chain->conn);
+   if (!chain->gc) {
+      /* FINISHME: Choose a better error. */
+      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail;
+   }
+
+   cookie = xcb_create_gc(chain->conn,
+                          chain->gc,
+                          chain->window,
+                          XCB_GC_GRAPHICS_EXPOSURES,
+                          (uint32_t []) { 0 });
+   xcb_discard_reply(chain->conn, cookie.sequence);
+
+   *swapchain_out = &chain->base;
+
+   return VK_SUCCESS;
+
+fail:
+   /* FINISHME: images/memory created before the failure are leaked (see
+    * the matching TODO in x11_swapchain_destroy); at least don't leak the
+    * chain itself, which was leaked here previously.
+    */
+   anv_free2(&device->alloc, pAllocator, chain);
+
+   return result;
+}
+
+/* The X11 back-end keeps no per-instance state (unlike Wayland), so init
+ * and finish are trivial.
+ */
+VkResult
+anv_x11_init_wsi(struct anv_instance *instance)
+{
+   return VK_SUCCESS;
+}
+
+void
+anv_x11_finish_wsi(struct anv_instance *instance)
+{ }
diff --git a/src/vulkan/gen75_pack.h b/src/vulkan/gen75_pack.h
new file mode 100644 (file)
index 0000000..b012032
--- /dev/null
@@ -0,0 +1,8421 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+/* Instructions, enums and structures for HSW.
+ *
+ * This file has been generated, do not hand edit.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h>
+
+#ifndef __gen_validate_value
+#define __gen_validate_value(x)
+#endif
+
+#ifndef __gen_field_functions
+#define __gen_field_functions
+
+union __gen_value {
+   float f;
+   uint32_t dw;
+};
+
+static inline uint64_t
+__gen_mbo(uint32_t start, uint32_t end)
+{
+   return (~0ul >> (64 - (end - start + 1))) << start;
+}
+
+static inline uint64_t
+__gen_field(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   if (end - start + 1 < 64)
+      assert(v < 1ul << (end - start + 1));
+#endif
+
+   return v << start;
+}
+
+static inline uint64_t
+__gen_fixed(float v, uint32_t start, uint32_t end,
+            bool is_signed, uint32_t fract_bits)
+{
+   __gen_validate_value(v);
+
+   const float factor = (1 << fract_bits);
+
+   float max, min;
+   if (is_signed) {
+      max = ((1 << (end - start)) - 1) / factor;
+      min = -(1 << (end - start)) / factor;
+   } else {
+      max = ((1 << (end - start + 1)) - 1) / factor;
+      min = 0.0f;
+   }
+
+   if (v > max)
+      v = max;
+   else if (v < min)
+      v = min;
+
+   int32_t int_val = roundf(v * factor);
+
+   if (is_signed)
+      int_val &= (1 << (end - start + 1)) - 1;
+
+   return int_val << start;
+}
+
+static inline uint64_t
+__gen_offset(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   uint64_t mask = (~0ul >> (64 - (end - start + 1))) << start;
+
+   assert((v & ~mask) == 0);
+#endif
+
+   return v;
+}
+
+static inline uint32_t
+__gen_float(float v)
+{
+   __gen_validate_value(v);
+   return ((union __gen_value) { .f = (v) }).dw;
+}
+
+#ifndef __gen_address_type
+#error #define __gen_address_type before including this file
+#endif
+
+#ifndef __gen_user_data
+#error #define __gen_combine_address before including this file
+#endif
+
+#endif
+
+#define GEN75_3DSTATE_URB_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_URB_VS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 48,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_URB_VS_length 0x00000002
+
+struct GEN75_3DSTATE_URB_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     VSURBStartingAddress;
+   uint32_t                                     VSURBEntryAllocationSize;
+   uint32_t                                     VSNumberofURBEntries;
+};
+
+static inline void
+GEN75_3DSTATE_URB_VS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_3DSTATE_URB_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->VSURBStartingAddress, 25, 30) |
+      __gen_field(values->VSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->VSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+#define GEN75_GPGPU_CSR_BASE_ADDRESS_length_bias 0x00000002
+#define GEN75_GPGPU_CSR_BASE_ADDRESS_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  0
+
+#define GEN75_GPGPU_CSR_BASE_ADDRESS_length 0x00000002
+
+struct GEN75_GPGPU_CSR_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GPGPUCSRBaseAddress;
+};
+
+static inline void
+GEN75_GPGPU_CSR_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN75_GPGPU_CSR_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->GPGPUCSRBaseAddress, dw1);
+
+}
+
+#define GEN75_MI_STORE_REGISTER_MEM_length_bias 0x00000002
+#define GEN75_MI_STORE_REGISTER_MEM_header      \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 36,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_STORE_REGISTER_MEM_length 0x00000003
+
+struct GEN75_MI_STORE_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+static inline void
+GEN75_MI_STORE_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN75_MI_STORE_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->PredicateEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+#define GEN75_PIPELINE_SELECT_length_bias 0x00000001
+#define GEN75_PIPELINE_SELECT_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4
+
+#define GEN75_PIPELINE_SELECT_length 0x00000001
+
+struct GEN75_PIPELINE_SELECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+#define     _3D                                                0
+#define     Media                                              1
+#define     GPGPU                                              2
+   uint32_t                                     PipelineSelection;
+};
+
+static inline void
+GEN75_PIPELINE_SELECT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN75_PIPELINE_SELECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->PipelineSelection, 0, 1) |
+      0;
+
+}
+
+#define GEN75_STATE_BASE_ADDRESS_length_bias 0x00000002
+#define GEN75_STATE_BASE_ADDRESS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  1,                  \
+   .DwordLength          =  8
+
+#define GEN75_STATE_BASE_ADDRESS_length 0x0000000a
+
+#define GEN75_MEMORY_OBJECT_CONTROL_STATE_length 0x00000001
+
+struct GEN75_MEMORY_OBJECT_CONTROL_STATE {
+   uint32_t                                     LLCeLLCCacheabilityControlLLCCC;
+   uint32_t                                     L3CacheabilityControlL3CC;
+};
+
+static inline void
+GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN75_MEMORY_OBJECT_CONTROL_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->LLCeLLCCacheabilityControlLLCCC, 1, 2) |
+      __gen_field(values->L3CacheabilityControlL3CC, 0, 0) |
+      0;
+
+}
+
+struct GEN75_STATE_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GeneralStateBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     GeneralStateMemoryObjectControlState;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     StatelessDataPortAccessMemoryObjectControlState;
+   bool                                         GeneralStateBaseAddressModifyEnable;
+   __gen_address_type                           SurfaceStateBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     SurfaceStateMemoryObjectControlState;
+   bool                                         SurfaceStateBaseAddressModifyEnable;
+   __gen_address_type                           DynamicStateBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     DynamicStateMemoryObjectControlState;
+   bool                                         DynamicStateBaseAddressModifyEnable;
+   __gen_address_type                           IndirectObjectBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     IndirectObjectMemoryObjectControlState;
+   bool                                         IndirectObjectBaseAddressModifyEnable;
+   __gen_address_type                           InstructionBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     InstructionMemoryObjectControlState;
+   bool                                         InstructionBaseAddressModifyEnable;
+   __gen_address_type                           GeneralStateAccessUpperBound;
+   bool                                         GeneralStateAccessUpperBoundModifyEnable;
+   __gen_address_type                           DynamicStateAccessUpperBound;
+   bool                                         DynamicStateAccessUpperBoundModifyEnable;
+   __gen_address_type                           IndirectObjectAccessUpperBound;
+   bool                                         IndirectObjectAccessUpperBoundModifyEnable;
+   __gen_address_type                           InstructionAccessUpperBound;
+   bool                                         InstructionAccessUpperBoundModifyEnable;
+};
+
+static inline void
+GEN75_STATE_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN75_STATE_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_GeneralStateMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_GeneralStateMemoryObjectControlState, &values->GeneralStateMemoryObjectControlState);
+   uint32_t dw_StatelessDataPortAccessMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StatelessDataPortAccessMemoryObjectControlState, &values->StatelessDataPortAccessMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_GeneralStateMemoryObjectControlState, 8, 11) |
+      __gen_field(dw_StatelessDataPortAccessMemoryObjectControlState, 4, 7) |
+      __gen_field(values->GeneralStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->GeneralStateBaseAddress, dw1);
+
+   uint32_t dw_SurfaceStateMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceStateMemoryObjectControlState, &values->SurfaceStateMemoryObjectControlState);
+   uint32_t dw2 =
+      __gen_field(dw_SurfaceStateMemoryObjectControlState, 8, 11) |
+      __gen_field(values->SurfaceStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceStateBaseAddress, dw2);
+
+   uint32_t dw_DynamicStateMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DynamicStateMemoryObjectControlState, &values->DynamicStateMemoryObjectControlState);
+   uint32_t dw3 =
+      __gen_field(dw_DynamicStateMemoryObjectControlState, 8, 11) |
+      __gen_field(values->DynamicStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->DynamicStateBaseAddress, dw3);
+
+   uint32_t dw_IndirectObjectMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_IndirectObjectMemoryObjectControlState, &values->IndirectObjectMemoryObjectControlState);
+   uint32_t dw4 =
+      __gen_field(dw_IndirectObjectMemoryObjectControlState, 8, 11) |
+      __gen_field(values->IndirectObjectBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[4] =
+      __gen_combine_address(data, &dw[4], values->IndirectObjectBaseAddress, dw4);
+
+   uint32_t dw_InstructionMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_InstructionMemoryObjectControlState, &values->InstructionMemoryObjectControlState);
+   uint32_t dw5 =
+      __gen_field(dw_InstructionMemoryObjectControlState, 8, 11) |
+      __gen_field(values->InstructionBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[5] =
+      __gen_combine_address(data, &dw[5], values->InstructionBaseAddress, dw5);
+
+   uint32_t dw6 =
+      __gen_field(values->GeneralStateAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[6] =
+      __gen_combine_address(data, &dw[6], values->GeneralStateAccessUpperBound, dw6);
+
+   uint32_t dw7 =
+      __gen_field(values->DynamicStateAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[7] =
+      __gen_combine_address(data, &dw[7], values->DynamicStateAccessUpperBound, dw7);
+
+   uint32_t dw8 =
+      __gen_field(values->IndirectObjectAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[8] =
+      __gen_combine_address(data, &dw[8], values->IndirectObjectAccessUpperBound, dw8);
+
+   uint32_t dw9 =
+      __gen_field(values->InstructionAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[9] =
+      __gen_combine_address(data, &dw[9], values->InstructionAccessUpperBound, dw9);
+
+}
+
+#define GEN75_STATE_PREFETCH_length_bias 0x00000002
+#define GEN75_STATE_PREFETCH_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN75_STATE_PREFETCH_length 0x00000002
+
+struct GEN75_STATE_PREFETCH {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PrefetchPointer;
+   uint32_t                                     PrefetchCount;
+};
+
+static inline void
+GEN75_STATE_PREFETCH_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_STATE_PREFETCH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->PrefetchCount, 0, 2) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PrefetchPointer, dw1);
+
+}
+
+#define GEN75_STATE_SIP_length_bias 0x00000002
+#define GEN75_STATE_SIP_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2,                  \
+   .DwordLength          =  0
+
+#define GEN75_STATE_SIP_length 0x00000002
+
+struct GEN75_STATE_SIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SystemInstructionPointer;
+};
+
+static inline void
+GEN75_STATE_SIP_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN75_STATE_SIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SystemInstructionPointer, 4, 31) |
+      0;
+
+}
+
+#define GEN75_SWTESS_BASE_ADDRESS_length_bias 0x00000002
+#define GEN75_SWTESS_BASE_ADDRESS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN75_SWTESS_BASE_ADDRESS_length 0x00000002
+
+struct GEN75_SWTESS_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           SWTessellationBaseAddress;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     SWTessellationMemoryObjectControlState;
+};
+
+static inline void
+GEN75_SWTESS_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_SWTESS_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_SWTessellationMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SWTessellationMemoryObjectControlState, &values->SWTessellationMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_SWTessellationMemoryObjectControlState, 8, 11) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->SWTessellationBaseAddress, dw1);
+
+}
+
+#define GEN75_3DPRIMITIVE_length_bias 0x00000002
+#define GEN75_3DPRIMITIVE_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  3,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DPRIMITIVE_length 0x00000007
+
+struct GEN75_3DPRIMITIVE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndirectParameterEnable;
+   uint32_t                                     UAVCoherencyRequired;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   bool                                         EndOffsetEnable;
+#define     SEQUENTIAL                                         0
+#define     RANDOM                                             1
+   uint32_t                                     VertexAccessType;
+   uint32_t                                     PrimitiveTopologyType;
+   uint32_t                                     VertexCountPerInstance;
+   uint32_t                                     StartVertexLocation;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     StartInstanceLocation;
+   uint32_t                                     BaseVertexLocation;
+};
+
+static inline void
+GEN75_3DPRIMITIVE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN75_3DPRIMITIVE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->UAVCoherencyRequired, 9, 9) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->EndOffsetEnable, 9, 9) |
+      __gen_field(values->VertexAccessType, 8, 8) |
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->VertexCountPerInstance, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->StartVertexLocation, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->InstanceCount, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->StartInstanceLocation, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->BaseVertexLocation, 0, 31) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_AA_LINE_PARAMETERS_length_bias 0x00000002
+#define GEN75_3DSTATE_AA_LINE_PARAMETERS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_AA_LINE_PARAMETERS_length 0x00000003
+
+struct GEN75_3DSTATE_AA_LINE_PARAMETERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        AACoverageBias;
+   float                                        AACoverageSlope;
+   float                                        AACoverageEndCapBias;
+   float                                        AACoverageEndCapSlope;
+};
+
+static inline void
+GEN75_3DSTATE_AA_LINE_PARAMETERS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_AA_LINE_PARAMETERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AACoverageBias * (1 << 8), 16, 23) |
+      __gen_field(values->AACoverageSlope * (1 << 8), 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AACoverageEndCapBias * (1 << 8), 16, 23) |
+      __gen_field(values->AACoverageEndCapSlope * (1 << 8), 0, 7) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 70
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_DS_length 0x00000000
+
+#define GEN75_BINDING_TABLE_EDIT_ENTRY_length 0x00000001
+
+struct GEN75_BINDING_TABLE_EDIT_ENTRY {
+   uint32_t                                     BindingTableIndex;
+   uint32_t                                     SurfaceStatePointer;
+};
+
+static inline void
+GEN75_BINDING_TABLE_EDIT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN75_BINDING_TABLE_EDIT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BindingTableIndex, 16, 23) |
+      __gen_offset(values->SurfaceStatePointer, 0, 15) |
+      0;
+
+}
+
+struct GEN75_3DSTATE_BINDING_TABLE_EDIT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_EDIT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_BINDING_TABLE_EDIT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 68
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_GS_length 0x00000000
+
+struct GEN75_3DSTATE_BINDING_TABLE_EDIT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_EDIT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_BINDING_TABLE_EDIT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_EDIT_HS: 3D state command (type 3, subtype 3,
+ * opcode 0, sub-opcode 69).  _length is 0 because the command is variable
+ * length: a fixed two-dword header followed by caller-supplied edit entries. */
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 69
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_HS_length 0x00000000
+
+struct GEN75_3DSTATE_BINDING_TABLE_EDIT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+/* Values for BindingTableEditTarget: */
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Packs only the two fixed dwords; the variable-length edit entries that
+ * follow are emitted by the caller. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_EDIT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_BINDING_TABLE_EDIT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_EDIT_PS: 3D state command (type 3, subtype 3,
+ * opcode 0, sub-opcode 71).  _length is 0 because the command is variable
+ * length: a fixed two-dword header followed by caller-supplied edit entries. */
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_PS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 71
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_PS_length 0x00000000
+
+struct GEN75_3DSTATE_BINDING_TABLE_EDIT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+/* Values for BindingTableEditTarget: */
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Packs only the two fixed dwords; the variable-length edit entries that
+ * follow are emitted by the caller. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_EDIT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_BINDING_TABLE_EDIT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_EDIT_VS: 3D state command (type 3, subtype 3,
+ * opcode 0, sub-opcode 67).  _length is 0 because the command is variable
+ * length: a fixed two-dword header followed by caller-supplied edit entries. */
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 67
+
+#define GEN75_3DSTATE_BINDING_TABLE_EDIT_VS_length 0x00000000
+
+struct GEN75_3DSTATE_BINDING_TABLE_EDIT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+/* Values for BindingTableEditTarget: */
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Packs only the two fixed dwords; the variable-length edit entries that
+ * follow are emitted by the caller. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_EDIT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_BINDING_TABLE_EDIT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 40).  DW1 carries the DS
+ * binding-table offset, packed into bits 15:5 via __gen_offset. */
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 40,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS_length 0x00000002
+
+struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSBindingTable;
+};
+
+/* Emits both dwords of the command into dst. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 41).  DW1 carries the GS
+ * binding-table offset, packed into bits 15:5 via __gen_offset. */
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 41,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS_length 0x00000002
+
+struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSBindingTable;
+};
+
+/* Emits both dwords of the command into dst. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 39).  DW1 carries the HS
+ * binding-table offset, packed into bits 15:5 via __gen_offset. */
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 39,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS_length 0x00000002
+
+struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSBindingTable;
+};
+
+/* Emits both dwords of the command into dst. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 42).  DW1 carries the PS
+ * binding-table offset, packed into bits 15:5 via __gen_offset. */
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 42,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS_length 0x00000002
+
+struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSBindingTable;
+};
+
+/* Emits both dwords of the command into dst. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 38).  DW1 carries the VS
+ * binding-table offset, packed into bits 15:5 via __gen_offset. */
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 38,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS_length 0x00000002
+
+struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSBindingTable;
+};
+
+/* Emits both dwords of the command into dst. */
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_BINDING_TABLE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC: three-dword command
+ * (type 3, subtype 3, opcode 1, sub-opcode 25).  DW1 combines the pool
+ * enable bit (11), the surface memory-object control state (bits 10:7)
+ * and the pool base address; DW2 carries the pool upper-bound address. */
+#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC_length_bias 0x00000002
+#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC_length 0x00000003
+
+struct GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BindingTablePoolBaseAddress;
+   uint32_t                                     BindingTablePoolEnable;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     SurfaceObjectControlState;
+   __gen_address_type                           BindingTablePoolUpperBound;
+};
+
+static inline void
+GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Pack the embedded control-state struct first, then merge its dword
+    * with the non-address bits before combining with the base address. */
+   uint32_t dw_SurfaceObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceObjectControlState, &values->SurfaceObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->BindingTablePoolEnable, 11, 11) |
+      __gen_field(dw_SurfaceObjectControlState, 7, 10) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BindingTablePoolBaseAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->BindingTablePoolUpperBound, dw2);
+
+}
+
+/* GEN75_3DSTATE_BLEND_STATE_POINTERS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 36).  DW1 carries the blend
+ * state offset in bits 31:6 and sets bit 0 via __gen_mbo. */
+#define GEN75_3DSTATE_BLEND_STATE_POINTERS_length_bias 0x00000002
+#define GEN75_3DSTATE_BLEND_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 36,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_BLEND_STATE_POINTERS_length 0x00000002
+
+struct GEN75_3DSTATE_BLEND_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BlendStatePointer;
+};
+
+static inline void
+GEN75_3DSTATE_BLEND_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN75_3DSTATE_BLEND_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->BlendStatePointer, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_CC_STATE_POINTERS: fixed two-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 14).  DW1 carries the color
+ * calculator state offset in bits 31:6 and sets bit 0 via __gen_mbo. */
+#define GEN75_3DSTATE_CC_STATE_POINTERS_length_bias 0x00000002
+#define GEN75_3DSTATE_CC_STATE_POINTERS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 14,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_CC_STATE_POINTERS_length 0x00000002
+
+struct GEN75_3DSTATE_CC_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ColorCalcStatePointer;
+};
+
+static inline void
+GEN75_3DSTATE_CC_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN75_3DSTATE_CC_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ColorCalcStatePointer, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_CHROMA_KEY: fixed four-dword command
+ * (type 3, subtype 3, opcode 1, sub-opcode 4).  DW1 selects the chroma-key
+ * table entry (bits 31:30); DW2/DW3 hold the full 32-bit low and high
+ * key values. */
+#define GEN75_3DSTATE_CHROMA_KEY_length_bias 0x00000002
+#define GEN75_3DSTATE_CHROMA_KEY_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_CHROMA_KEY_length 0x00000004
+
+struct GEN75_3DSTATE_CHROMA_KEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ChromaKeyTableIndex;
+   uint32_t                                     ChromaKeyLowValue;
+   uint32_t                                     ChromaKeyHighValue;
+};
+
+static inline void
+GEN75_3DSTATE_CHROMA_KEY_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN75_3DSTATE_CHROMA_KEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyTableIndex, 30, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChromaKeyLowValue, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ChromaKeyHighValue, 0, 31) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_CLEAR_PARAMS: fixed three-dword command
+ * (type 3, subtype 3, opcode 0, sub-opcode 4).  DW1 is the raw 32-bit depth
+ * clear value; DW2 bit 0 flags whether that value is valid. */
+#define GEN75_3DSTATE_CLEAR_PARAMS_length_bias 0x00000002
+#define GEN75_3DSTATE_CLEAR_PARAMS_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_CLEAR_PARAMS_length 0x00000003
+
+struct GEN75_3DSTATE_CLEAR_PARAMS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DepthClearValue;
+   bool                                         DepthClearValueValid;
+};
+
+static inline void
+GEN75_3DSTATE_CLEAR_PARAMS_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_3DSTATE_CLEAR_PARAMS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DepthClearValue, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DepthClearValueValid, 0, 0) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_CLIP: fixed four-dword clipper state command
+ * (type 3, subtype 3, opcode 0, sub-opcode 18).  The point-width fields in
+ * DW3 are floats converted to fixed point with 3 fractional bits (the
+ * "* (1 << 3)" in the pack function). */
+#define GEN75_3DSTATE_CLIP_length_bias 0x00000002
+#define GEN75_3DSTATE_CLIP_header               \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_CLIP_length 0x00000004
+
+struct GEN75_3DSTATE_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     FrontWinding;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   bool                                         EarlyCullEnable;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+   bool                                         ClipperStatisticsEnable;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+   bool                                         ClipEnable;
+#define     APIMODE_OGL                                        0
+   uint32_t                                     APIMode;
+   bool                                         ViewportXYClipTestEnable;
+   bool                                         ViewportZClipTestEnable;
+   bool                                         GuardbandClipTestEnable;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+#define     CLIPMODE_NORMAL                                    0
+#define     CLIPMODE_REJECT_ALL                                3
+#define     CLIPMODE_ACCEPT_ALL                                4
+   uint32_t                                     ClipMode;
+   bool                                         PerspectiveDivideDisable;
+   bool                                         NonPerspectiveBarycentricEnable;
+/* Provoking-vertex selects; the Vertex* macros below are redefined with
+ * identical values for each select field (a benign redefinition in C). */
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+   uint32_t                                     LineStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+   float                                        MinimumPointWidth;
+   float                                        MaximumPointWidth;
+   bool                                         ForceZeroRTAIndexEnable;
+   uint32_t                                     MaximumVPIndex;
+};
+
+static inline void
+GEN75_3DSTATE_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_3DSTATE_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->FrontWinding, 20, 20) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 19, 19) |
+      __gen_field(values->EarlyCullEnable, 18, 18) |
+      __gen_field(values->CullMode, 16, 17) |
+      __gen_field(values->ClipperStatisticsEnable, 10, 10) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClipEnable, 31, 31) |
+      __gen_field(values->APIMode, 30, 30) |
+      __gen_field(values->ViewportXYClipTestEnable, 28, 28) |
+      __gen_field(values->ViewportZClipTestEnable, 27, 27) |
+      __gen_field(values->GuardbandClipTestEnable, 26, 26) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 16, 23) |
+      __gen_field(values->ClipMode, 13, 15) |
+      __gen_field(values->PerspectiveDivideDisable, 9, 9) |
+      __gen_field(values->NonPerspectiveBarycentricEnable, 8, 8) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 4, 5) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 2, 3) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 0, 1) |
+      0;
+
+   /* Point widths are converted to fixed point (3 fractional bits). */
+   dw[3] =
+      __gen_field(values->MinimumPointWidth * (1 << 3), 17, 27) |
+      __gen_field(values->MaximumPointWidth * (1 << 3), 6, 16) |
+      __gen_field(values->ForceZeroRTAIndexEnable, 5, 5) |
+      __gen_field(values->MaximumVPIndex, 0, 3) |
+      0;
+
+}
+
+/* GEN75_3DSTATE_CONSTANT_DS: seven-dword push-constant command
+ * (type 3, subtype 3, opcode 0, sub-opcode 26).  Its six-dword payload is
+ * the shared GEN75_3DSTATE_CONSTANT_BODY defined below, which is also used
+ * by the GS/HS/PS variants of this command. */
+#define GEN75_3DSTATE_CONSTANT_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_CONSTANT_DS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_CONSTANT_DS_length 0x00000007
+
+#define GEN75_3DSTATE_CONSTANT_BODY_length 0x00000006
+
+/* Shared six-dword body of the 3DSTATE_CONSTANT_* commands: per-buffer read
+ * lengths (DW0/DW1) followed by four constant-buffer addresses (DW2-DW5);
+ * DW2 also carries the constant-buffer object control state in bits 4:0. */
+struct GEN75_3DSTATE_CONSTANT_BODY {
+   uint32_t                                     ConstantBuffer1ReadLength;
+   uint32_t                                     ConstantBuffer0ReadLength;
+   uint32_t                                     ConstantBuffer3ReadLength;
+   uint32_t                                     ConstantBuffer2ReadLength;
+   __gen_address_type                           PointerToConstantBuffer0;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     ConstantBufferObjectControlState;
+   __gen_address_type                           PointerToConstantBuffer1;
+   __gen_address_type                           PointerToConstantBuffer2;
+   __gen_address_type                           PointerToConstantBuffer3;
+};
+
+static inline void
+GEN75_3DSTATE_CONSTANT_BODY_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN75_3DSTATE_CONSTANT_BODY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ConstantBuffer1ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer0ReadLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBuffer3ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer2ReadLength, 0, 15) |
+      0;
+
+   /* Pack the embedded control-state struct, then fold its dword into the
+    * low bits merged with the buffer-0 address. */
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   uint32_t dw2 =
+      __gen_field(dw_ConstantBufferObjectControlState, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->PointerToConstantBuffer0, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->PointerToConstantBuffer1, dw3);
+
+   uint32_t dw4 =
+      0;
+
+   dw[4] =
+      __gen_combine_address(data, &dw[4], values->PointerToConstantBuffer2, dw4);
+
+   uint32_t dw5 =
+      0;
+
+   dw[5] =
+      __gen_combine_address(data, &dw[5], values->PointerToConstantBuffer3, dw5);
+
+}
+
+struct GEN75_3DSTATE_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_3DSTATE_CONSTANT_BODY           ConstantBody;
+};
+
+/* Packs the command header (DW0) then delegates DW1-DW6 to the shared body. */
+static inline void
+GEN75_3DSTATE_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN75_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN75_3DSTATE_CONSTANT_GS: seven-dword push-constant command
+ * (type 3, subtype 3, opcode 0, sub-opcode 22); payload is the shared
+ * GEN75_3DSTATE_CONSTANT_BODY packed into DW1-DW6. */
+#define GEN75_3DSTATE_CONSTANT_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_CONSTANT_GS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_CONSTANT_GS_length 0x00000007
+
+struct GEN75_3DSTATE_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_3DSTATE_CONSTANT_BODY           ConstantBody;
+};
+
+static inline void
+GEN75_3DSTATE_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN75_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN75_3DSTATE_CONSTANT_HS: seven-dword push-constant command
+ * (type 3, subtype 3, opcode 0, sub-opcode 25); payload is the shared
+ * GEN75_3DSTATE_CONSTANT_BODY packed into DW1-DW6. */
+#define GEN75_3DSTATE_CONSTANT_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_CONSTANT_HS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_CONSTANT_HS_length 0x00000007
+
+struct GEN75_3DSTATE_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_3DSTATE_CONSTANT_BODY           ConstantBody;
+};
+
+static inline void
+GEN75_3DSTATE_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN75_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN75_3DSTATE_CONSTANT_PS_length_bias 0x00000002
+/* Designated-initializer header: opcode/subopcode identifying
+ * 3DSTATE_CONSTANT_PS; DwordLength = 5 (total length 7, bias 2). */
+#define GEN75_3DSTATE_CONSTANT_PS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 23,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_CONSTANT_PS_length 0x00000007
+
+/* 3DSTATE_CONSTANT_PS (HSW): pixel-shader push-constant command (7 dwords). */
+struct GEN75_3DSTATE_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_3DSTATE_CONSTANT_BODY           ConstantBody;
+};
+
+/* Pack *values into dst (7 dwords): dw[0] = command header; dw[1..6] are
+ * written by GEN75_3DSTATE_CONSTANT_BODY_pack. */
+static inline void
+GEN75_3DSTATE_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN75_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN75_3DSTATE_CONSTANT_VS_length_bias 0x00000002
+/* Designated-initializer header: opcode/subopcode identifying
+ * 3DSTATE_CONSTANT_VS; DwordLength = 5 (total length 7, bias 2). */
+#define GEN75_3DSTATE_CONSTANT_VS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_CONSTANT_VS_length 0x00000007
+
+/* 3DSTATE_CONSTANT_VS (HSW): vertex-shader push-constant command (7 dwords). */
+struct GEN75_3DSTATE_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_3DSTATE_CONSTANT_BODY           ConstantBody;
+};
+
+/* Pack *values into dst (7 dwords): dw[0] = command header; dw[1..6] are
+ * written by GEN75_3DSTATE_CONSTANT_BODY_pack. */
+static inline void
+GEN75_3DSTATE_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN75_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN75_3DSTATE_DEPTH_BUFFER_length_bias 0x00000002
+/* Designated-initializer header for 3DSTATE_DEPTH_BUFFER (7 dwords total). */
+#define GEN75_3DSTATE_DEPTH_BUFFER_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  5,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_DEPTH_BUFFER_length 0x00000007
+
+/* 3DSTATE_DEPTH_BUFFER (HSW): describes the bound depth surface (type,
+ * format, base address, dimensions, LOD, MOCS, coordinate offsets). */
+struct GEN75_3DSTATE_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         DepthWriteEnable;
+   bool                                         StencilWriteEnable;
+   bool                                         HierarchicalDepthBufferEnable;
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     SurfaceFormat;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     LOD;
+   /* NOTE(review): generator artifact — name suggests "must be zero for
+    * SURFTYPE_CUBE"; confirm against the PRM before relying on it. */
+#define     SURFTYPE_CUBEmustbezero                            0
+   uint32_t                                     Depth;
+   uint32_t                                     MinimumArrayElement;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     DepthBufferObjectControlState;
+   uint32_t                                     DepthCoordinateOffsetY;
+   uint32_t                                     DepthCoordinateOffsetX;
+   uint32_t                                     RenderTargetViewExtent;
+};
+
+/* Pack *values into dst (7 dwords). dw[2] combines SurfaceBaseAddress with
+ * zero low bits via __gen_combine_address (relocation hook); the MOCS
+ * sub-struct is packed into a scratch dword and merged into dw[4] bits 0..3. */
+static inline void
+GEN75_3DSTATE_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_3DSTATE_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->DepthWriteEnable, 28, 28) |
+      __gen_field(values->StencilWriteEnable, 27, 27) |
+      __gen_field(values->HierarchicalDepthBufferEnable, 22, 22) |
+      __gen_field(values->SurfaceFormat, 18, 20) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[3] =
+      __gen_field(values->Height, 18, 31) |
+      __gen_field(values->Width, 4, 17) |
+      __gen_field(values->LOD, 0, 3) |
+      0;
+
+   /* Written (not read) by the MOCS pack call below before being merged. */
+   uint32_t dw_DepthBufferObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DepthBufferObjectControlState, &values->DepthBufferObjectControlState);
+   dw[4] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->MinimumArrayElement, 10, 20) |
+      __gen_field(dw_DepthBufferObjectControlState, 0, 3) |
+      0;
+
+   dw[5] =
+      __gen_field(values->DepthCoordinateOffsetY, 16, 31) |
+      __gen_field(values->DepthCoordinateOffsetX, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->RenderTargetViewExtent, 21, 31) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_length_bias 0x00000002
+/* Designated-initializer header; 2-dword command (DwordLength 0 + bias 2). */
+#define GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 37,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_length 0x00000002
+
+/* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS (HSW): points the pipeline at a
+ * DEPTH_STENCIL_STATE structure (64-byte-aligned offset, see pack below). */
+struct GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDEPTH_STENCIL_STATE;
+};
+
+/* Pack *values into dst (2 dwords): dw[0] = command header; dw[1] = state
+ * pointer in bits 6..31 plus a must-be-one bit 0 (__gen_mbo). */
+static inline void
+GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                                const struct GEN75_3DSTATE_DEPTH_STENCIL_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDEPTH_STENCIL_STATE, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_DRAWING_RECTANGLE_length_bias 0x00000002
+/* Designated-initializer header; 4-dword command (DwordLength 2 + bias 2). */
+#define GEN75_3DSTATE_DRAWING_RECTANGLE_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_DRAWING_RECTANGLE_length 0x00000004
+
+/* 3DSTATE_DRAWING_RECTANGLE (HSW): clipped drawing rectangle (min/max in
+ * dw1/dw2) and drawing-rectangle origin (dw3), all as 16-bit X/Y pairs. */
+struct GEN75_3DSTATE_DRAWING_RECTANGLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+#define     Legacy                                             0
+#define     Core0Enabled                                       1
+#define     Core1Enabled                                       2
+   uint32_t                                     CoreModeSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ClippedDrawingRectangleYMin;
+   uint32_t                                     ClippedDrawingRectangleXMin;
+   uint32_t                                     ClippedDrawingRectangleYMax;
+   uint32_t                                     ClippedDrawingRectangleXMax;
+   uint32_t                                     DrawingRectangleOriginY;
+   uint32_t                                     DrawingRectangleOriginX;
+};
+
+/* Pack *values into dst (4 dwords). Unlike most commands, dw[0] also carries
+ * CoreModeSelect in bits 14..15. */
+static inline void
+GEN75_3DSTATE_DRAWING_RECTANGLE_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN75_3DSTATE_DRAWING_RECTANGLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->CoreModeSelect, 14, 15) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ClippedDrawingRectangleYMin, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMin, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClippedDrawingRectangleYMax, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMax, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DrawingRectangleOriginY, 16, 31) |
+      __gen_field(values->DrawingRectangleOriginX, 0, 15) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_DS_length_bias 0x00000002
+/* Designated-initializer header; 6-dword command (DwordLength 4 + bias 2). */
+#define GEN75_3DSTATE_DS_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 29,                  \
+   .DwordLength          =  4
+
+#define GEN75_3DSTATE_DS_length 0x00000006
+
+/* 3DSTATE_DS (HSW): domain-shader stage state — kernel pointer, dispatch
+ * controls, scratch space, URB read layout, thread count and enables. */
+struct GEN75_3DSTATE_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleDomainPointDispatch;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         AccessesUAV;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     PatchURBEntryReadLength;
+   uint32_t                                     PatchURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+   bool                                         ComputeWCoordinateEnable;
+   bool                                         DSCacheDisable;
+   bool                                         DSFunctionEnable;
+};
+
+/* Pack *values into dst (6 dwords); bit ranges below follow the generated
+ * command layout for Haswell. */
+static inline void
+GEN75_3DSTATE_DS_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleDomainPointDispatch, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->AccessesUAV, 14, 14) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
+      __gen_field(values->PatchURBEntryReadLength, 11, 17) |
+      __gen_field(values->PatchURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 21, 29) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->ComputeWCoordinateEnable, 2, 2) |
+      __gen_field(values->DSCacheDisable, 1, 1) |
+      __gen_field(values->DSFunctionEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_DS_length_bias 0x00000002
+/* Header macro for 3DSTATE_GATHER_CONSTANT_DS; no DwordLength initializer
+ * because the command is variable-length (length define below is 0). */
+#define GEN75_3DSTATE_GATHER_CONSTANT_DS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 55
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_DS_length 0x00000000
+
+#define GEN75_GATHER_CONSTANT_ENTRY_length 0x00000001
+
+/* GATHER_CONSTANT_ENTRY: one dword describing a single gathered constant
+ * (buffer offset, channel mask, binding-table index offset). Used as the
+ * variable-length tail of the 3DSTATE_GATHER_CONSTANT_* commands. */
+struct GEN75_GATHER_CONSTANT_ENTRY {
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ChannelMask;
+   uint32_t                                     BindingTableIndexOffset;
+};
+
+/* Pack *values into a single dword at dst. */
+static inline void
+GEN75_GATHER_CONSTANT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN75_GATHER_CONSTANT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->ConstantBufferOffset, 8, 15) |
+      __gen_field(values->ChannelMask, 4, 7) |
+      __gen_field(values->BindingTableIndexOffset, 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_GATHER_CONSTANT_DS (HSW): variable-length command; the three
+ * fixed dwords are followed by GATHER_CONSTANT_ENTRY dwords appended by the
+ * caller (hence "variable length fields follow"). */
+struct GEN75_3DSTATE_GATHER_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   /* variable length fields follow */
+};
+
+/* Pack only the three fixed dwords of *values into dst; the caller emits
+ * the trailing entries and must set DwordLength accordingly. */
+static inline void
+GEN75_3DSTATE_GATHER_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_GATHER_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_GS_length_bias 0x00000002
+/* Header macro; no DwordLength initializer — variable-length command. */
+#define GEN75_3DSTATE_GATHER_CONSTANT_GS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 53
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_GS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_GS (HSW): three fixed dwords followed by
+ * caller-appended GATHER_CONSTANT_ENTRY dwords. */
+struct GEN75_3DSTATE_GATHER_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   /* variable length fields follow */
+};
+
+/* Pack only the three fixed dwords; the caller emits the trailing entries. */
+static inline void
+GEN75_3DSTATE_GATHER_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_GATHER_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_HS_length_bias 0x00000002
+/* Header macro; no DwordLength initializer — variable-length command. */
+#define GEN75_3DSTATE_GATHER_CONSTANT_HS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 54
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_HS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_HS (HSW): three fixed dwords followed by
+ * caller-appended GATHER_CONSTANT_ENTRY dwords. */
+struct GEN75_3DSTATE_GATHER_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   /* variable length fields follow */
+};
+
+/* Pack only the three fixed dwords; the caller emits the trailing entries. */
+static inline void
+GEN75_3DSTATE_GATHER_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_GATHER_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_PS_length_bias 0x00000002
+/* Header macro; no DwordLength initializer — variable-length command. */
+#define GEN75_3DSTATE_GATHER_CONSTANT_PS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 56
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_PS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_PS (HSW): like the DS/GS/HS variants but with an
+ * extra ConstantBufferDx9Enable bit packed into dw2 bit 4. */
+struct GEN75_3DSTATE_GATHER_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9Enable;
+   /* variable length fields follow */
+};
+
+/* Pack only the three fixed dwords; the caller emits the trailing entries. */
+static inline void
+GEN75_3DSTATE_GATHER_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_GATHER_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_VS_length_bias 0x00000002
+/* Header macro; no DwordLength initializer — variable-length command. */
+#define GEN75_3DSTATE_GATHER_CONSTANT_VS_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 52
+
+#define GEN75_3DSTATE_GATHER_CONSTANT_VS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_VS (HSW): like the PS variant, including the
+ * ConstantBufferDx9Enable bit in dw2 bit 4. */
+struct GEN75_3DSTATE_GATHER_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9Enable;
+   /* variable length fields follow */
+};
+
+/* Pack only the three fixed dwords; the caller emits the trailing entries. */
+static inline void
+GEN75_3DSTATE_GATHER_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_3DSTATE_GATHER_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_3DSTATE_GATHER_POOL_ALLOC_length_bias 0x00000002
+/* Designated-initializer header; 3-dword command (DwordLength 1 + bias 2). */
+#define GEN75_3DSTATE_GATHER_POOL_ALLOC_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_GATHER_POOL_ALLOC_length 0x00000003
+
+/* 3DSTATE_GATHER_POOL_ALLOC (HSW): gather-pool base/upper-bound addresses,
+ * enable bit, and MOCS for the pool. */
+struct GEN75_3DSTATE_GATHER_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GatherPoolBaseAddress;
+   bool                                         GatherPoolEnable;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     MemoryObjectControlState;
+   __gen_address_type                           GatherPoolUpperBound;
+};
+
+/* Pack *values into dst (3 dwords). dw[1] merges enable bit 11, must-be-one
+ * bits 4..5 (__gen_mbo) and the packed MOCS (bits 0..3) with the base
+ * address via __gen_combine_address; dw[2] is the upper-bound address. */
+static inline void
+GEN75_3DSTATE_GATHER_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN75_3DSTATE_GATHER_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Written (not read) by the MOCS pack call below before being merged. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->GatherPoolEnable, 11, 11) |
+      __gen_mbo(4, 5) |
+      __gen_field(dw_MemoryObjectControlState, 0, 3) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->GatherPoolBaseAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->GatherPoolUpperBound, dw2);
+
+}
+
+#define GEN75_3DSTATE_GS_length_bias 0x00000002
+/* Designated-initializer header; 7-dword command (DwordLength 5 + bias 2). */
+#define GEN75_3DSTATE_GS_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_GS_length 0x00000007
+
+/* 3DSTATE_GS (HSW): geometry-shader stage state — kernel pointer, dispatch
+ * controls, scratch space, URB read layout, output topology/vertex size,
+ * stream/control-data configuration and enables. The #define value lists
+ * name the encodings for the adjacent enum-like fields. */
+struct GEN75_3DSTATE_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     SingleProgramFlowSPF;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   uint32_t                                     GSaccessesUAV;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     OutputVertexSize;
+   uint32_t                                     OutputTopology;
+   uint32_t                                     VertexURBEntryReadLength;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     DispatchGRFStartRegisterforURBData;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     ControlDataHeaderSize;
+   uint32_t                                     InstanceControl;
+   uint32_t                                     DefaultStreamID;
+#define     SINGLE                                             0
+#define     DUAL_INSTANCE                                      1
+#define     DUAL_OBJECT                                        2
+   uint32_t                                     DispatchMode;
+   uint32_t                                     GSStatisticsEnable;
+   uint32_t                                     GSInvocationsIncrementValue;
+   bool                                         IncludePrimitiveID;
+   uint32_t                                     Hint;
+#define     REORDER_LEADING                                    0
+#define     REORDER_TRAILING                                   1
+   uint32_t                                     ReorderMode;
+   bool                                         DiscardAdjacency;
+   bool                                         GSEnable;
+#define     GSCTL_CUT                                          0
+#define     GSCTL_SID                                          1
+   uint32_t                                     ControlDataFormat;
+   uint32_t                                     SemaphoreHandle;
+};
+
+/* Pack *values into dst as the 7-dword GEN75 3DSTATE_GS command.
+ * __gen_field/__gen_offset are assumed to mask and shift the value into
+ * the given inclusive bit range (low, high) of the dword — TODO confirm
+ * against the __gen_* helper definitions, which are not visible here.
+ */
+static inline void
+GEN75_3DSTATE_GS_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header (type/opcode/sub-opcode/length). */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: kernel start pointer, 64-byte aligned (bits 5:0 implied zero). */
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   /* DW2: thread-control flags for the GS kernel. */
+   dw[2] =
+      __gen_field(values->SingleProgramFlowSPF, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->GSaccessesUAV, 12, 12) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   /* DW3: scratch space base (1KB aligned) and per-thread size. */
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   /* DW4: URB read/write layout for GS input vertices. */
+   dw[4] =
+      __gen_field(values->OutputVertexSize, 23, 28) |
+      __gen_field(values->OutputTopology, 17, 22) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->IncludeVertexHandles, 10, 10) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      __gen_field(values->DispatchGRFStartRegisterforURBData, 0, 3) |
+      0;
+
+   /* DW5: dispatch configuration and GS enable bit. */
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 24, 31) |
+      __gen_field(values->ControlDataHeaderSize, 20, 23) |
+      __gen_field(values->InstanceControl, 15, 19) |
+      __gen_field(values->DefaultStreamID, 13, 14) |
+      __gen_field(values->DispatchMode, 11, 12) |
+      __gen_field(values->GSStatisticsEnable, 10, 10) |
+      __gen_field(values->GSInvocationsIncrementValue, 5, 9) |
+      __gen_field(values->IncludePrimitiveID, 4, 4) |
+      __gen_field(values->Hint, 3, 3) |
+      __gen_field(values->ReorderMode, 2, 2) |
+      __gen_field(values->DiscardAdjacency, 1, 1) |
+      __gen_field(values->GSEnable, 0, 0) |
+      0;
+
+   /* DW6: control-data format plus semaphore handle. */
+   dw[6] =
+      __gen_field(values->ControlDataFormat, 31, 31) |
+      __gen_offset(values->SemaphoreHandle, 0, 12) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_HIER_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN75_3DSTATE_HIER_DEPTH_BUFFER_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_HIER_DEPTH_BUFFER_length 0x00000003
+
+/* Parameters for the 3-dword 3DSTATE_HIER_DEPTH_BUFFER command, which
+ * binds the hierarchical-depth (HiZ) buffer: cache control, pitch, and
+ * the buffer's base address (relocated via __gen_combine_address).
+ */
+struct GEN75_3DSTATE_HIER_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     HierarchicalDepthBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+};
+
+/* Pack *values into dst; the nested MOCS struct is packed into a scratch
+ * dword first, then merged into bits 28:25 of DW1.
+ */
+static inline void
+GEN75_3DSTATE_HIER_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN75_3DSTATE_HIER_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_HierarchicalDepthBufferObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_HierarchicalDepthBufferObjectControlState, &values->HierarchicalDepthBufferObjectControlState);
+   dw[1] =
+      __gen_field(dw_HierarchicalDepthBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   /* DW2 is an address; __gen_combine_address presumably records a
+    * relocation for &dw[2] and ORs in the dw2 literal bits (none here).
+    */
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+}
+
+#define GEN75_3DSTATE_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_HS_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 27,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_HS_length 0x00000007
+
+/* Parameters for the 7-dword 3DSTATE_HS command configuring the hull
+ * shader (tessellation control) stage.
+ */
+struct GEN75_3DSTATE_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         Enable;
+   bool                                         StatisticsEnable;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+   bool                                         HSaccessesUAV;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     SemaphoreHandle;
+};
+
+/* Pack *values into dst as the 7-dword 3DSTATE_HS command. */
+static inline void
+GEN75_3DSTATE_HS_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: sampler/binding-table sizing and exception controls. */
+   dw[1] =
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 12, 12) |
+      __gen_field(values->MaximumNumberofThreads, 0, 7) |
+      0;
+
+   /* DW2: stage enable, statistics, and instance count. */
+   dw[2] =
+      __gen_field(values->Enable, 31, 31) |
+      __gen_field(values->StatisticsEnable, 29, 29) |
+      __gen_field(values->InstanceCount, 0, 3) |
+      0;
+
+   /* DW3: kernel start pointer (64-byte aligned). */
+   dw[3] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   /* DW4: scratch space base and per-thread size. */
+   dw[4] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   /* DW5: URB read layout and GRF dispatch start for URB data. */
+   dw[5] =
+      __gen_field(values->SingleProgramFlow, 27, 27) |
+      __gen_field(values->VectorMaskEnable, 26, 26) |
+      __gen_field(values->HSaccessesUAV, 25, 25) |
+      __gen_field(values->IncludeVertexHandles, 24, 24) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 19, 23) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   /* DW6: semaphore handle. */
+   dw[6] =
+      __gen_offset(values->SemaphoreHandle, 0, 12) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_INDEX_BUFFER_length_bias 0x00000002
+#define GEN75_3DSTATE_INDEX_BUFFER_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_INDEX_BUFFER_length 0x00000003
+
+/* Parameters for the 3-dword 3DSTATE_INDEX_BUFFER command: index format
+ * (byte/word/dword), cache control, and the buffer's start/end addresses.
+ */
+struct GEN75_3DSTATE_INDEX_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     MemoryObjectControlState;
+#define     INDEX_BYTE                                         0
+#define     INDEX_WORD                                         1
+#define     INDEX_DWORD                                        2
+   uint32_t                                     IndexFormat;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BufferStartingAddress;
+   __gen_address_type                           BufferEndingAddress;
+};
+
+/* Pack *values into dst; note the MOCS field lives in the header dword
+ * (DW0 bits 15:12) for this command, unlike most other state packets.
+ */
+static inline void
+GEN75_3DSTATE_INDEX_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_3DSTATE_INDEX_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_MemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_MemoryObjectControlState, 12, 15) |
+      __gen_field(values->IndexFormat, 8, 9) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1/DW2: relocated buffer start and end addresses. */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->BufferEndingAddress, dw2);
+
+}
+
+#define GEN75_3DSTATE_LINE_STIPPLE_length_bias 0x00000002
+#define GEN75_3DSTATE_LINE_STIPPLE_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  8,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_LINE_STIPPLE_length 0x00000003
+
+/* Parameters for the 3-dword 3DSTATE_LINE_STIPPLE command: the 16-bit
+ * stipple pattern, repeat counts, and the fixed-point inverse repeat
+ * count used by hardware to step through the pattern.
+ */
+struct GEN75_3DSTATE_LINE_STIPPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ModifyEnableCurrentRepeatCounterCurrentStippleIndex;
+   uint32_t                                     CurrentRepeatCounter;
+   uint32_t                                     CurrentStippleIndex;
+   uint32_t                                     LineStipplePattern;
+   float                                        LineStippleInverseRepeatCount;
+   uint32_t                                     LineStippleRepeatCount;
+};
+
+/* Pack *values into dst as the 3-dword 3DSTATE_LINE_STIPPLE command. */
+static inline void
+GEN75_3DSTATE_LINE_STIPPLE_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_3DSTATE_LINE_STIPPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ModifyEnableCurrentRepeatCounterCurrentStippleIndex, 31, 31) |
+      __gen_field(values->CurrentRepeatCounter, 21, 29) |
+      __gen_field(values->CurrentStippleIndex, 16, 19) |
+      __gen_field(values->LineStipplePattern, 0, 15) |
+      0;
+
+   /* The float inverse repeat count is converted to fixed point by
+    * scaling by 2^16 (u1.16 format: 16 fractional bits) before packing.
+    */
+   dw[2] =
+      __gen_field(values->LineStippleInverseRepeatCount * (1 << 16), 15, 31) |
+      __gen_field(values->LineStippleRepeatCount, 0, 8) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_MONOFILTER_SIZE_length_bias 0x00000002
+#define GEN75_3DSTATE_MONOFILTER_SIZE_header    \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_MONOFILTER_SIZE_length 0x00000002
+
+/* Parameters for the 2-dword 3DSTATE_MONOFILTER_SIZE command, setting
+ * the monochrome filter width and height.
+ */
+struct GEN75_3DSTATE_MONOFILTER_SIZE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     MonochromeFilterWidth;
+   uint32_t                                     MonochromeFilterHeight;
+};
+
+/* Pack *values into dst as the 2-dword 3DSTATE_MONOFILTER_SIZE command. */
+static inline void
+GEN75_3DSTATE_MONOFILTER_SIZE_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN75_3DSTATE_MONOFILTER_SIZE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MonochromeFilterWidth, 3, 5) |
+      __gen_field(values->MonochromeFilterHeight, 0, 2) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_MULTISAMPLE_length_bias 0x00000002
+#define GEN75_3DSTATE_MULTISAMPLE_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 13,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_MULTISAMPLE_length 0x00000004
+
+/* Parameters for the 4-dword 3DSTATE_MULTISAMPLE command: sample count,
+ * pixel location, and per-sample X/Y offsets (fractional positions
+ * within the pixel, packed as 4-bit fixed point).
+ */
+struct GEN75_3DSTATE_MULTISAMPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         MultiSampleEnable;
+#define     PIXLOC_CENTER                                      0
+#define     PIXLOC_UL_CORNER                                   1
+   uint32_t                                     PixelLocation;
+#define     NUMSAMPLES_1                                       0
+#define     NUMSAMPLES_4                                       2
+#define     NUMSAMPLES_8                                       3
+   uint32_t                                     NumberofMultisamples;
+   float                                        Sample3XOffset;
+   float                                        Sample3YOffset;
+   float                                        Sample2XOffset;
+   float                                        Sample2YOffset;
+   float                                        Sample1XOffset;
+   float                                        Sample1YOffset;
+   float                                        Sample0XOffset;
+   float                                        Sample0YOffset;
+   float                                        Sample7XOffset;
+   float                                        Sample7YOffset;
+   float                                        Sample6XOffset;
+   float                                        Sample6YOffset;
+   float                                        Sample5XOffset;
+   float                                        Sample5YOffset;
+   float                                        Sample4XOffset;
+   float                                        Sample4YOffset;
+};
+
+/* Pack *values into dst as the 4-dword 3DSTATE_MULTISAMPLE command.
+ * Each float sample offset is scaled by 2^4 (4 fractional bits) into a
+ * 4-bit nibble; DW2 holds samples 0-3, DW3 holds samples 4-7.
+ */
+static inline void
+GEN75_3DSTATE_MULTISAMPLE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_MULTISAMPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MultiSampleEnable, 5, 5) |
+      __gen_field(values->PixelLocation, 4, 4) |
+      __gen_field(values->NumberofMultisamples, 1, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Sample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Sample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_POLY_STIPPLE_OFFSET_length_bias 0x00000002
+#define GEN75_3DSTATE_POLY_STIPPLE_OFFSET_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_POLY_STIPPLE_OFFSET_length 0x00000002
+
+/* Parameters for the 2-dword 3DSTATE_POLY_STIPPLE_OFFSET command:
+ * X/Y offsets into the 32x32 polygon stipple pattern.
+ */
+struct GEN75_3DSTATE_POLY_STIPPLE_OFFSET {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PolygonStippleXOffset;
+   uint32_t                                     PolygonStippleYOffset;
+};
+
+/* Pack *values into dst as the 2-dword 3DSTATE_POLY_STIPPLE_OFFSET command. */
+static inline void
+GEN75_3DSTATE_POLY_STIPPLE_OFFSET_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN75_3DSTATE_POLY_STIPPLE_OFFSET * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PolygonStippleXOffset, 8, 12) |
+      __gen_field(values->PolygonStippleYOffset, 0, 4) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_POLY_STIPPLE_PATTERN_length_bias 0x00000002
+#define GEN75_3DSTATE_POLY_STIPPLE_PATTERN_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          = 31
+
+#define GEN75_3DSTATE_POLY_STIPPLE_PATTERN_length 0x00000021
+
+/* Parameters for the 33-dword 3DSTATE_POLY_STIPPLE_PATTERN command:
+ * 32 rows of 32-bit polygon stipple pattern data.
+ */
+struct GEN75_3DSTATE_POLY_STIPPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PatternRow[32];
+};
+
+/* Pack *values into dst: the header dword followed by one dword per
+ * pattern row (dw[1]..dw[32]).
+ */
+static inline void
+GEN75_3DSTATE_POLY_STIPPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN75_3DSTATE_POLY_STIPPLE_PATTERN * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   for (uint32_t i = 0, j = 1; i < 32; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->PatternRow[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+#define GEN75_3DSTATE_PS_length_bias 0x00000002
+#define GEN75_3DSTATE_PS_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 32,                  \
+   .DwordLength          =  6
+
+#define GEN75_3DSTATE_PS_length 0x00000008
+
+/* Parameters for the 8-dword 3DSTATE_PS command configuring the pixel
+ * shader stage: up to three kernel start pointers (for the 8/16/32-pixel
+ * dispatch modes), thread controls, and render-target interaction flags.
+ */
+struct GEN75_3DSTATE_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer0;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlowSPF;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+   uint32_t                                     SamplerCount;
+#define     FTZ                                                0
+#define     RET                                                1
+   uint32_t                                     DenormalMode;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadPriority;
+#define     IEEE745                                            0
+#define     Alt                                                1
+   uint32_t                                     FloatingPointMode;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     SampleMask;
+   bool                                         PushConstantEnable;
+   bool                                         AttributeEnable;
+   bool                                         oMaskPresenttoRenderTarget;
+   bool                                         RenderTargetFastClearEnable;
+   bool                                         DualSourceBlendEnable;
+   bool                                         RenderTargetResolveEnable;
+   bool                                         PSAccessesUAV;
+#define     POSOFFSET_NONE                                     0
+#define     POSOFFSET_CENTROID                                 2
+#define     POSOFFSET_SAMPLE                                   3
+   uint32_t                                     PositionXYOffsetSelect;
+   bool                                         _32PixelDispatchEnable;
+   bool                                         _16PixelDispatchEnable;
+   bool                                         _8PixelDispatchEnable;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData0;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData1;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData2;
+   uint32_t                                     KernelStartPointer1;
+   uint32_t                                     KernelStartPointer2;
+};
+
+/* Pack *values into dst as the 8-dword 3DSTATE_PS command. */
+static inline void
+GEN75_3DSTATE_PS_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: kernel start pointer 0 (64-byte aligned). */
+   dw[1] =
+      __gen_offset(values->KernelStartPointer0, 6, 31) |
+      0;
+
+   /* DW2: thread control, FP mode, and exception enables. */
+   dw[2] =
+      __gen_field(values->SingleProgramFlowSPF, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->DenormalMode, 26, 26) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->RoundingMode, 14, 15) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   /* DW3: scratch space base and per-thread size. */
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   /* DW4: dispatch enables and render-target interaction flags. */
+   dw[4] =
+      __gen_field(values->MaximumNumberofThreads, 23, 31) |
+      __gen_field(values->SampleMask, 12, 19) |
+      __gen_field(values->PushConstantEnable, 11, 11) |
+      __gen_field(values->AttributeEnable, 10, 10) |
+      __gen_field(values->oMaskPresenttoRenderTarget, 9, 9) |
+      __gen_field(values->RenderTargetFastClearEnable, 8, 8) |
+      __gen_field(values->DualSourceBlendEnable, 7, 7) |
+      __gen_field(values->RenderTargetResolveEnable, 6, 6) |
+      __gen_field(values->PSAccessesUAV, 5, 5) |
+      __gen_field(values->PositionXYOffsetSelect, 3, 4) |
+      __gen_field(values->_32PixelDispatchEnable, 2, 2) |
+      __gen_field(values->_16PixelDispatchEnable, 1, 1) |
+      __gen_field(values->_8PixelDispatchEnable, 0, 0) |
+      0;
+
+   /* DW5: GRF start registers for the three constant setup data sets. */
+   dw[5] =
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData0, 16, 22) |
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData1, 8, 14) |
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData2, 0, 6) |
+      0;
+
+   /* DW6/DW7: kernel start pointers 1 and 2. */
+   dw[6] =
+      __gen_offset(values->KernelStartPointer1, 6, 31) |
+      0;
+
+   dw[7] =
+      __gen_offset(values->KernelStartPointer2, 6, 31) |
+      0;
+
+}
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length 0x00000002
+
+/* Parameters for the 2-dword 3DSTATE_PUSH_CONSTANT_ALLOC_DS command,
+ * allocating the DS stage's slice of the push-constant buffer
+ * (offset and size; units not stated here — presumably 2KB chunks,
+ * TODO confirm against the PRM).
+ */
+struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as the 2-dword PUSH_CONSTANT_ALLOC_DS command. */
+static inline void
+GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_GS (Haswell): identical layout to the DS
+ * variant above; only the sub-opcode (21) differs.  DW1: offset bits 20:16,
+ * size bits 5:0. */
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length 0x00000002
+
+struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_HS (Haswell): same layout as the other
+ * PUSH_CONSTANT_ALLOC commands; sub-opcode 19.  DW1: offset bits 20:16,
+ * size bits 5:0. */
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length 0x00000002
+
+struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_PS (Haswell): same layout as the other
+ * PUSH_CONSTANT_ALLOC commands; sub-opcode 22.  DW1: offset bits 20:16,
+ * size bits 5:0. */
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length_bias 0x00000002
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length 0x00000002
+
+struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_VS (Haswell): same layout as the other
+ * PUSH_CONSTANT_ALLOC commands; sub-opcode 18.  DW1: offset bits 20:16,
+ * size bits 5:0. */
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length 0x00000002
+
+struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_PUSH_CONSTANT_ALLOC_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_RAST_MULTISAMPLE (Haswell): 6-dword command selecting the
+ * rasterization sample count (DW1 bits 3:1, NRM_NUMRASTSAMPLES_* encodings)
+ * and up to 16 programmable sample positions.  Each X/Y offset is given as a
+ * float and converted to a 4-bit value by multiplying by 16 before packing --
+ * presumably 0.4 fixed point in the [0, 1) pixel range; TODO confirm against
+ * the PRM. */
+#define GEN75_3DSTATE_RAST_MULTISAMPLE_length_bias 0x00000002
+#define GEN75_3DSTATE_RAST_MULTISAMPLE_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 14,                  \
+   .DwordLength          =  4
+
+#define GEN75_3DSTATE_RAST_MULTISAMPLE_length 0x00000006
+
+struct GEN75_3DSTATE_RAST_MULTISAMPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     NRM_NUMRASTSAMPLES_1                               0
+#define     NRM_NUMRASTSAMPLES_2                               1
+#define     NRM_NUMRASTSAMPLES_4                               2
+#define     NRM_NUMRASTSAMPLES_8                               3
+#define     NRM_NUMRASTSAMPLES_16                              4
+   uint32_t                                     NumberofRasterizationMultisamples;
+   float                                        Sample3XOffset;
+   float                                        Sample3YOffset;
+   float                                        Sample2XOffset;
+   float                                        Sample2YOffset;
+   float                                        Sample1XOffset;
+   float                                        Sample1YOffset;
+   float                                        Sample0XOffset;
+   float                                        Sample0YOffset;
+   float                                        Sample7XOffset;
+   float                                        Sample7YOffset;
+   float                                        Sample6XOffset;
+   float                                        Sample6YOffset;
+   float                                        Sample5XOffset;
+   float                                        Sample5YOffset;
+   float                                        Sample4XOffset;
+   float                                        Sample4YOffset;
+   float                                        Sample11XOffset;
+   float                                        Sample11YOffset;
+   float                                        Sample10XOffset;
+   float                                        Sample10YOffset;
+   float                                        Sample9XOffset;
+   float                                        Sample9YOffset;
+   float                                        Sample8XOffset;
+   float                                        Sample8YOffset;
+   float                                        Sample15XOffset;
+   float                                        Sample15YOffset;
+   float                                        Sample14XOffset;
+   float                                        Sample14YOffset;
+   float                                        Sample13XOffset;
+   float                                        Sample13YOffset;
+   float                                        Sample12XOffset;
+   float                                        Sample12YOffset;
+};
+
+static inline void
+GEN75_3DSTATE_RAST_MULTISAMPLE_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN75_3DSTATE_RAST_MULTISAMPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->NumberofRasterizationMultisamples, 1, 3) |
+      0;
+
+   /* Samples 0-3: four X/Y nibble pairs per dword. */
+   dw[2] =
+      __gen_field(values->Sample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Sample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->Sample11XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample11YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample10XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample10YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample9XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample9YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample8XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample8YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[5] =
+      __gen_field(values->Sample15XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample15YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample14XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample14YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample13XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample13YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample12XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample12YOffset * (1 << 4), 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_PALETTE_LOAD0 header macros (variable-length command, so
+ * no DwordLength default and _length is 0), plus the PALETTE_ENTRY element
+ * that callers append after the command header: one ARGB dword per entry,
+ * 8 bits per channel (A 31:24, R 23:16, G 15:8, B 7:0). */
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2
+
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0_length 0x00000000
+
+#define GEN75_PALETTE_ENTRY_length 0x00000001
+
+struct GEN75_PALETTE_ENTRY {
+   uint32_t                                     Alpha;
+   uint32_t                                     Red;
+   uint32_t                                     Green;
+   uint32_t                                     Blue;
+};
+
+static inline void
+GEN75_PALETTE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_PALETTE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->Alpha, 24, 31) |
+      __gen_field(values->Red, 16, 23) |
+      __gen_field(values->Green, 8, 15) |
+      __gen_field(values->Blue, 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_PALETTE_LOAD0: only the fixed header dword is packed here;
+ * the caller emits the trailing PALETTE_ENTRY dwords itself and must set
+ * DwordLength to match the number of entries appended. */
+struct GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_SAMPLER_PALETTE_LOAD0 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_SAMPLER_PALETTE_LOAD1: second sampler palette; identical to
+ * LOAD0 except for the sub-opcode (12).  Variable-length: caller appends
+ * PALETTE_ENTRY dwords and sets DwordLength accordingly. */
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 12
+
+#define GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1_length 0x00000000
+
+struct GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN75_3DSTATE_SAMPLER_PALETTE_LOAD1 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_DS (Haswell): DW1 carries the DS sampler
+ * state pointer; __gen_offset keeps bits 31:5, i.e. the address must be
+ * 32-byte aligned (low 5 bits are MBZ). */
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 45,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSSamplerState;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_GS: same shape as the DS variant;
+ * sub-opcode 46.  DW1 = GS sampler state pointer, bits 31:5. */
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 46,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSSamplerState;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_HS: same shape as the DS variant;
+ * sub-opcode 44.  DW1 = HS sampler state pointer, bits 31:5. */
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 44,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSSamplerState;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_PS: same shape as the DS variant;
+ * sub-opcode 47.  DW1 = PS sampler state pointer, bits 31:5. */
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 47,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSSamplerState;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_VS: same shape as the DS variant;
+ * sub-opcode 43.  DW1 = VS sampler state pointer, bits 31:5. */
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 43,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSSamplerState;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN75_3DSTATE_SAMPLER_STATE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLE_MASK (Haswell): DW1 bits 7:0 hold the coverage sample mask
+ * (one bit per sample, so up to 8 samples are maskable here). */
+#define GEN75_3DSTATE_SAMPLE_MASK_length_bias 0x00000002
+#define GEN75_3DSTATE_SAMPLE_MASK_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SAMPLE_MASK_length 0x00000002
+
+struct GEN75_3DSTATE_SAMPLE_MASK {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SampleMask;
+};
+
+static inline void
+GEN75_3DSTATE_SAMPLE_MASK_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_3DSTATE_SAMPLE_MASK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SampleMask, 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_SBE (Haswell setup-backend): 14-dword command.  DW1 configures
+ * attribute swizzling and URB read length/offset; DW2 is the template for the
+ * per-attribute override/swizzle pair (attributes 2n+1 in the high half,
+ * 2n in the low half); DW10/11 are point-sprite and constant-interpolation
+ * enable bitmasks; DW12/13 are the wrap-shortest nibbles for attributes
+ * 0-15. */
+#define GEN75_3DSTATE_SBE_length_bias 0x00000002
+#define GEN75_3DSTATE_SBE_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 31,                  \
+   .DwordLength          = 12
+
+#define GEN75_3DSTATE_SBE_length 0x0000000e
+
+struct GEN75_3DSTATE_SBE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     AttributeSwizzleControlMode;
+   uint32_t                                     NumberofSFOutputAttributes;
+   bool                                         AttributeSwizzleEnable;
+#define     UPPERLEFT                                          0
+#define     LOWERLEFT                                          1
+   uint32_t                                     PointSpriteTextureCoordinateOrigin;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   bool                                         Attribute2n1ComponentOverrideW;
+   bool                                         Attribute2n1ComponentOverrideZ;
+   bool                                         Attribute2n1ComponentOverrideY;
+   bool                                         Attribute2n1ComponentOverrideX;
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     Attribute2n1ConstantSource;
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     Attribute2n1SwizzleSelect;
+   uint32_t                                     Attribute2n1SourceAttribute;
+   bool                                         Attribute2nComponentOverrideW;
+   bool                                         Attribute2nComponentOverrideZ;
+   bool                                         Attribute2nComponentOverrideY;
+   bool                                         Attribute2nComponentOverrideX;
+   /* Identical redefinitions of the CONST_*, PRIM_ID and INPUTATTR_* macros
+    * above; benign in C because the replacement lists match exactly. */
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     Attribute2nConstantSource;
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     Attribute2nSwizzleSelect;
+   uint32_t                                     Attribute2nSourceAttribute;
+   uint32_t                                     PointSpriteTextureCoordinateEnable;
+   uint32_t                                     ConstantInterpolationEnable310;
+   uint32_t                                     Attribute7WrapShortestEnables;
+   uint32_t                                     Attribute6WrapShortestEnables;
+   uint32_t                                     Attribute5WrapShortestEnables;
+   uint32_t                                     Attribute4WrapShortestEnables;
+   uint32_t                                     Attribute3WrapShortestEnables;
+   uint32_t                                     Attribute2WrapShortestEnables;
+   uint32_t                                     Attribute1WrapShortestEnables;
+   uint32_t                                     Attribute0WrapShortestEnables;
+   uint32_t                                     Attribute15WrapShortestEnables;
+   uint32_t                                     Attribute14WrapShortestEnables;
+   uint32_t                                     Attribute13WrapShortestEnables;
+   uint32_t                                     Attribute12WrapShortestEnables;
+   uint32_t                                     Attribute11WrapShortestEnables;
+   uint32_t                                     Attribute10WrapShortestEnables;
+   uint32_t                                     Attribute9WrapShortestEnables;
+   uint32_t                                     Attribute8WrapShortestEnables;
+};
+
+static inline void
+GEN75_3DSTATE_SBE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN75_3DSTATE_SBE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AttributeSwizzleControlMode, 28, 28) |
+      __gen_field(values->NumberofSFOutputAttributes, 22, 27) |
+      __gen_field(values->AttributeSwizzleEnable, 21, 21) |
+      __gen_field(values->PointSpriteTextureCoordinateOrigin, 20, 20) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 15) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Attribute2n1ComponentOverrideW, 31, 31) |
+      __gen_field(values->Attribute2n1ComponentOverrideZ, 30, 30) |
+      __gen_field(values->Attribute2n1ComponentOverrideY, 29, 29) |
+      __gen_field(values->Attribute2n1ComponentOverrideX, 28, 28) |
+      __gen_field(values->Attribute2n1ConstantSource, 25, 26) |
+      __gen_field(values->Attribute2n1SwizzleSelect, 22, 23) |
+      __gen_field(values->Attribute2n1SourceAttribute, 16, 20) |
+      __gen_field(values->Attribute2nComponentOverrideW, 15, 15) |
+      __gen_field(values->Attribute2nComponentOverrideZ, 14, 14) |
+      __gen_field(values->Attribute2nComponentOverrideY, 13, 13) |
+      __gen_field(values->Attribute2nComponentOverrideX, 12, 12) |
+      __gen_field(values->Attribute2nConstantSource, 9, 10) |
+      __gen_field(values->Attribute2nSwizzleSelect, 6, 7) |
+      __gen_field(values->Attribute2nSourceAttribute, 0, 4) |
+      0;
+
+   /* NOTE(review): dw[3]..dw[9] (attribute pairs 2..15) are never written by
+    * this pack function -- the generator only models the dw[2] template -- so
+    * callers must pre-zero the destination or fill those dwords themselves;
+    * otherwise stale memory is emitted to the ring.  TODO confirm against the
+    * genxml generator's intended usage. */
+   dw[10] =
+      __gen_field(values->PointSpriteTextureCoordinateEnable, 0, 31) |
+      0;
+
+   dw[11] =
+      __gen_field(values->ConstantInterpolationEnable310, 0, 31) |
+      0;
+
+   dw[12] =
+      __gen_field(values->Attribute7WrapShortestEnables, 28, 31) |
+      __gen_field(values->Attribute6WrapShortestEnables, 24, 27) |
+      __gen_field(values->Attribute5WrapShortestEnables, 20, 23) |
+      __gen_field(values->Attribute4WrapShortestEnables, 16, 19) |
+      __gen_field(values->Attribute3WrapShortestEnables, 12, 15) |
+      __gen_field(values->Attribute2WrapShortestEnables, 8, 11) |
+      __gen_field(values->Attribute1WrapShortestEnables, 4, 7) |
+      __gen_field(values->Attribute0WrapShortestEnables, 0, 3) |
+      0;
+
+   dw[13] =
+      __gen_field(values->Attribute15WrapShortestEnables, 28, 31) |
+      __gen_field(values->Attribute14WrapShortestEnables, 24, 27) |
+      __gen_field(values->Attribute13WrapShortestEnables, 20, 23) |
+      __gen_field(values->Attribute12WrapShortestEnables, 16, 19) |
+      __gen_field(values->Attribute11WrapShortestEnables, 12, 15) |
+      __gen_field(values->Attribute10WrapShortestEnables, 8, 11) |
+      __gen_field(values->Attribute9WrapShortestEnables, 4, 7) |
+      __gen_field(values->Attribute8WrapShortestEnables, 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_SCISSOR_STATE_POINTERS: points the hardware at the scissor
+ * rectangle state. The pointer is an offset packed into bits 31:5 of
+ * dword 1 (i.e. the state must be 32-byte aligned). */
+#define GEN75_3DSTATE_SCISSOR_STATE_POINTERS_length_bias 0x00000002
+#define GEN75_3DSTATE_SCISSOR_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 15,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_SCISSOR_STATE_POINTERS_length 0x00000002
+
+struct GEN75_3DSTATE_SCISSOR_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScissorRectPointer;
+};
+
+/* Pack a 3DSTATE_SCISSOR_STATE_POINTERS command into dst. */
+static inline void
+GEN75_3DSTATE_SCISSOR_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN75_3DSTATE_SCISSOR_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* __gen_offset packs an already-aligned offset without shifting. */
+   dw[1] =
+      __gen_offset(values->ScissorRectPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SF: strips-and-fans / rasterizer front-end state (fill modes,
+ * culling, line/point rasterization, global depth bias). */
+#define GEN75_3DSTATE_SF_length_bias 0x00000002
+#define GEN75_3DSTATE_SF_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  5
+
+#define GEN75_3DSTATE_SF_length 0x00000007
+
+struct GEN75_3DSTATE_SF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     D32_FLOAT_S8X24_UINT                               0
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_S8_UINT                                  2
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     DepthBufferSurfaceFormat;
+   bool                                         LegacyGlobalDepthBiasEnable;
+   bool                                         StatisticsEnable;
+   bool                                         GlobalDepthOffsetEnableSolid;
+   bool                                         GlobalDepthOffsetEnableWireframe;
+   bool                                         GlobalDepthOffsetEnablePoint;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     FrontFaceFillMode;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     BackFaceFillMode;
+   bool                                         ViewTransformEnable;
+   uint32_t                                     FrontWinding;
+   bool                                         AntiAliasingEnable;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+   float                                        LineWidth;
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   bool                                         LineStippleEnable;
+   bool                                         ScissorRectangleEnable;
+   bool                                         RTIndependentRasterizationEnable;
+   uint32_t                                     MultisampleRasterizationMode;
+   bool                                         LastPixelEnable;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+#define     AALINEDISTANCE_TRUE                                1
+   uint32_t                                     AALineDistanceMode;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   uint32_t                                     UsePointWidthState;
+   float                                        PointWidth;
+   float                                        GlobalDepthOffsetConstant;
+   float                                        GlobalDepthOffsetScale;
+   float                                        GlobalDepthOffsetClamp;
+};
+
+/* Pack a 3DSTATE_SF command into dst. LineWidth and PointWidth are
+ * converted to fixed point by pre-scaling (see notes inline); the
+ * three GlobalDepthOffset* fields are emitted as raw IEEE-754 bits. */
+static inline void
+GEN75_3DSTATE_SF_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_SF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DepthBufferSurfaceFormat, 12, 14) |
+      __gen_field(values->LegacyGlobalDepthBiasEnable, 11, 11) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->GlobalDepthOffsetEnableSolid, 9, 9) |
+      __gen_field(values->GlobalDepthOffsetEnableWireframe, 8, 8) |
+      __gen_field(values->GlobalDepthOffsetEnablePoint, 7, 7) |
+      __gen_field(values->FrontFaceFillMode, 5, 6) |
+      __gen_field(values->BackFaceFillMode, 3, 4) |
+      __gen_field(values->ViewTransformEnable, 1, 1) |
+      __gen_field(values->FrontWinding, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AntiAliasingEnable, 31, 31) |
+      __gen_field(values->CullMode, 29, 30) |
+      /* LineWidth scaled by 2^7: fixed point with 7 fractional bits. */
+      __gen_field(values->LineWidth * (1 << 7), 18, 27) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 16, 17) |
+      __gen_field(values->LineStippleEnable, 14, 14) |
+      __gen_field(values->ScissorRectangleEnable, 11, 11) |
+      __gen_field(values->RTIndependentRasterizationEnable, 10, 10) |
+      __gen_field(values->MultisampleRasterizationMode, 8, 9) |
+      0;
+
+   dw[3] =
+      __gen_field(values->LastPixelEnable, 31, 31) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 29, 30) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 27, 28) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 25, 26) |
+      __gen_field(values->AALineDistanceMode, 14, 14) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 12, 12) |
+      __gen_field(values->UsePointWidthState, 11, 11) |
+      /* PointWidth scaled by 2^3: fixed point with 3 fractional bits. */
+      __gen_field(values->PointWidth * (1 << 3), 0, 10) |
+      0;
+
+   dw[4] =
+      __gen_float(values->GlobalDepthOffsetConstant) |
+      0;
+
+   dw[5] =
+      __gen_float(values->GlobalDepthOffsetScale) |
+      0;
+
+   dw[6] =
+      __gen_float(values->GlobalDepthOffsetClamp) |
+      0;
+
+}
+
+/* 3DSTATE_SO_BUFFER: binds one stream-output (transform feedback)
+ * buffer: index, pitch, MOCS, and the base/end addresses (relocated
+ * through __gen_combine_address). */
+#define GEN75_3DSTATE_SO_BUFFER_length_bias 0x00000002
+#define GEN75_3DSTATE_SO_BUFFER_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_SO_BUFFER_length 0x00000004
+
+struct GEN75_3DSTATE_SO_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOBufferIndex;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     SOBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   __gen_address_type                           SurfaceEndAddress;
+};
+
+static inline void
+GEN75_3DSTATE_SO_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_3DSTATE_SO_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed separately, then merged into bits 28:25 of dw1. */
+   uint32_t dw_SOBufferObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SOBufferObjectControlState, &values->SOBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->SOBufferIndex, 29, 30) |
+      __gen_field(dw_SOBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 11) |
+      0;
+
+   /* Addresses go through __gen_combine_address so the driver can
+    * record relocations for dw[2] and dw[3]. */
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->SurfaceEndAddress, dw3);
+
+}
+
+/* 3DSTATE_SO_DECL_LIST header macros (variable-length command; length
+ * macro is 0) and the SO_DECL sub-structure: one stream-output
+ * declaration (buffer slot, hole flag, register index, component
+ * mask) packed into 16 bits of a dword. */
+#define GEN75_3DSTATE_SO_DECL_LIST_length_bias 0x00000002
+#define GEN75_3DSTATE_SO_DECL_LIST_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 23
+
+#define GEN75_3DSTATE_SO_DECL_LIST_length 0x00000000
+
+#define GEN75_SO_DECL_ENTRY_length 0x00000002
+
+#define GEN75_SO_DECL_length 0x00000001
+
+struct GEN75_SO_DECL {
+   uint32_t                                     OutputBufferSlot;
+   uint32_t                                     HoleFlag;
+   uint32_t                                     RegisterIndex;
+   uint32_t                                     ComponentMask;
+};
+
+/* Pack a single SO_DECL; only the low 14 bits of dw[0] are used. */
+static inline void
+GEN75_SO_DECL_pack(__gen_user_data *data, void * restrict dst,
+                   const struct GEN75_SO_DECL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->OutputBufferSlot, 12, 13) |
+      __gen_field(values->HoleFlag, 11, 11) |
+      __gen_field(values->RegisterIndex, 4, 9) |
+      __gen_field(values->ComponentMask, 0, 3) |
+      0;
+
+}
+
+/* SO_DECL_ENTRY: one declaration per stream (3..0) packed into a
+ * 64-bit entry, 16 bits per stream. */
+struct GEN75_SO_DECL_ENTRY {
+   struct GEN75_SO_DECL                         Stream3Decl;
+   struct GEN75_SO_DECL                         Stream2Decl;
+   struct GEN75_SO_DECL                         Stream1Decl;
+   struct GEN75_SO_DECL                         Stream0Decl;
+};
+
+static inline void
+GEN75_SO_DECL_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_SO_DECL_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Pack each per-stream SO_DECL into its own dword first, then
+    * place the low 16 bits of each at 16-bit offsets in a qword. */
+   uint32_t dw_Stream3Decl;
+   GEN75_SO_DECL_pack(data, &dw_Stream3Decl, &values->Stream3Decl);
+   uint32_t dw_Stream2Decl;
+   GEN75_SO_DECL_pack(data, &dw_Stream2Decl, &values->Stream2Decl);
+   uint32_t dw_Stream1Decl;
+   GEN75_SO_DECL_pack(data, &dw_Stream1Decl, &values->Stream1Decl);
+   uint32_t dw_Stream0Decl;
+   GEN75_SO_DECL_pack(data, &dw_Stream0Decl, &values->Stream0Decl);
+   uint64_t qw0 =
+      __gen_field(dw_Stream3Decl, 48, 63) |
+      __gen_field(dw_Stream2Decl, 32, 47) |
+      __gen_field(dw_Stream1Decl, 16, 31) |
+      __gen_field(dw_Stream0Decl, 0, 15) |
+      0;
+
+   /* Split the qword into two little-endian dwords. */
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+
+/* 3DSTATE_SO_DECL_LIST: variable-length command listing the SO_DECL
+ * entries that follow the three fixed dwords packed here. */
+struct GEN75_3DSTATE_SO_DECL_LIST {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StreamtoBufferSelects3;
+   uint32_t                                     StreamtoBufferSelects2;
+   uint32_t                                     StreamtoBufferSelects1;
+   uint32_t                                     StreamtoBufferSelects0;
+   uint32_t                                     NumEntries3;
+   uint32_t                                     NumEntries2;
+   uint32_t                                     NumEntries1;
+   uint32_t                                     NumEntries0;
+   /* variable length fields follow */
+};
+
+/* Pack only the fixed portion; the caller appends the SO_DECL_ENTRY
+ * array after dw[2]. */
+static inline void
+GEN75_3DSTATE_SO_DECL_LIST_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_3DSTATE_SO_DECL_LIST * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Note: this command uses a 9-bit DwordLength (bits 8:0), unlike
+    * the 8-bit length field of most other 3DSTATE commands. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StreamtoBufferSelects3, 12, 15) |
+      __gen_field(values->StreamtoBufferSelects2, 8, 11) |
+      __gen_field(values->StreamtoBufferSelects1, 4, 7) |
+      __gen_field(values->StreamtoBufferSelects0, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->NumEntries3, 24, 31) |
+      __gen_field(values->NumEntries2, 16, 23) |
+      __gen_field(values->NumEntries1, 8, 15) |
+      __gen_field(values->NumEntries0, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_STENCIL_BUFFER: binds the separate stencil buffer (enable
+ * bit, MOCS, pitch, relocated base address). */
+#define GEN75_3DSTATE_STENCIL_BUFFER_length_bias 0x00000002
+#define GEN75_3DSTATE_STENCIL_BUFFER_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_STENCIL_BUFFER_length 0x00000003
+
+struct GEN75_3DSTATE_STENCIL_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StencilBufferEnable;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     StencilBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+};
+
+static inline void
+GEN75_3DSTATE_STENCIL_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN75_3DSTATE_STENCIL_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed separately, then merged into bits 28:25 of dw1. */
+   uint32_t dw_StencilBufferObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StencilBufferObjectControlState, &values->StencilBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->StencilBufferEnable, 31, 31) |
+      __gen_field(dw_StencilBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   /* Base address is relocated via __gen_combine_address. */
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+}
+
+/* 3DSTATE_STREAMOUT: global stream-output (transform feedback)
+ * enables plus per-stream vertex read offset/length. */
+#define GEN75_3DSTATE_STREAMOUT_length_bias 0x00000002
+#define GEN75_3DSTATE_STREAMOUT_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 30,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_STREAMOUT_length 0x00000003
+
+struct GEN75_3DSTATE_STREAMOUT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOFunctionEnable;
+   uint32_t                                     RenderingDisable;
+   uint32_t                                     RenderStreamSelect;
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         SOStatisticsEnable;
+   uint32_t                                     SOBufferEnable3;
+   uint32_t                                     SOBufferEnable2;
+   uint32_t                                     SOBufferEnable1;
+   uint32_t                                     SOBufferEnable0;
+   uint32_t                                     Stream3VertexReadOffset;
+   uint32_t                                     Stream3VertexReadLength;
+   uint32_t                                     Stream2VertexReadOffset;
+   uint32_t                                     Stream2VertexReadLength;
+   uint32_t                                     Stream1VertexReadOffset;
+   uint32_t                                     Stream1VertexReadLength;
+   uint32_t                                     Stream0VertexReadOffset;
+   uint32_t                                     Stream0VertexReadLength;
+};
+
+static inline void
+GEN75_3DSTATE_STREAMOUT_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_3DSTATE_STREAMOUT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Dword 1: master enables and per-buffer enable bits. */
+   dw[1] =
+      __gen_field(values->SOFunctionEnable, 31, 31) |
+      __gen_field(values->RenderingDisable, 30, 30) |
+      __gen_field(values->RenderStreamSelect, 27, 28) |
+      __gen_field(values->ReorderMode, 26, 26) |
+      __gen_field(values->SOStatisticsEnable, 25, 25) |
+      __gen_field(values->SOBufferEnable3, 11, 11) |
+      __gen_field(values->SOBufferEnable2, 10, 10) |
+      __gen_field(values->SOBufferEnable1, 9, 9) |
+      __gen_field(values->SOBufferEnable0, 8, 8) |
+      0;
+
+   /* Dword 2: per-stream URB vertex read offset/length. */
+   dw[2] =
+      __gen_field(values->Stream3VertexReadOffset, 29, 29) |
+      __gen_field(values->Stream3VertexReadLength, 24, 28) |
+      __gen_field(values->Stream2VertexReadOffset, 21, 21) |
+      __gen_field(values->Stream2VertexReadLength, 16, 20) |
+      __gen_field(values->Stream1VertexReadOffset, 13, 13) |
+      __gen_field(values->Stream1VertexReadLength, 8, 12) |
+      __gen_field(values->Stream0VertexReadOffset, 5, 5) |
+      __gen_field(values->Stream0VertexReadLength, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_TE: tessellation engine state (partitioning, output
+ * topology, domain, mode, and max tessellation factors). */
+#define GEN75_3DSTATE_TE_length_bias 0x00000002
+#define GEN75_3DSTATE_TE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  2
+
+#define GEN75_3DSTATE_TE_length 0x00000004
+
+struct GEN75_3DSTATE_TE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     INTEGER                                            0
+#define     ODD_FRACTIONAL                                     1
+#define     EVEN_FRACTIONAL                                    2
+   uint32_t                                     Partitioning;
+#define     POINT                                              0
+#define     OUTPUT_LINE                                        1
+#define     OUTPUT_TRI_CW                                      2
+#define     OUTPUT_TRI_CCW                                     3
+   uint32_t                                     OutputTopology;
+#define     QUAD                                               0
+#define     TRI                                                1
+#define     ISOLINE                                            2
+   uint32_t                                     TEDomain;
+#define     HW_TESS                                            0
+#define     SW_TESS                                            1
+   uint32_t                                     TEMode;
+   bool                                         TEEnable;
+   float                                        MaximumTessellationFactorOdd;
+   float                                        MaximumTessellationFactorNotOdd;
+};
+
+/* Pack a 3DSTATE_TE command; the two max-tessellation-factor floats
+ * are emitted as raw IEEE-754 bits via __gen_float. */
+static inline void
+GEN75_3DSTATE_TE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_TE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Partitioning, 12, 13) |
+      __gen_field(values->OutputTopology, 8, 9) |
+      __gen_field(values->TEDomain, 4, 5) |
+      __gen_field(values->TEMode, 1, 2) |
+      __gen_field(values->TEEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->MaximumTessellationFactorOdd) |
+      0;
+
+   dw[3] =
+      __gen_float(values->MaximumTessellationFactorNotOdd) |
+      0;
+
+}
+
+/* 3DSTATE_URB_DS: URB allocation for the domain shader stage
+ * (starting address, entry size, entry count). */
+#define GEN75_3DSTATE_URB_DS_length_bias 0x00000002
+#define GEN75_3DSTATE_URB_DS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 50,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_URB_DS_length 0x00000002
+
+struct GEN75_3DSTATE_URB_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DSURBStartingAddress;
+   uint32_t                                     DSURBEntryAllocationSize;
+   uint32_t                                     DSNumberofURBEntries;
+};
+
+static inline void
+GEN75_3DSTATE_URB_DS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_3DSTATE_URB_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DSURBStartingAddress, 25, 30) |
+      __gen_field(values->DSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->DSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_URB_GS: URB allocation for the geometry shader stage;
+ * identical bit layout to the other 3DSTATE_URB_* commands. */
+#define GEN75_3DSTATE_URB_GS_length_bias 0x00000002
+#define GEN75_3DSTATE_URB_GS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 51,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_URB_GS_length 0x00000002
+
+struct GEN75_3DSTATE_URB_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     GSURBStartingAddress;
+   uint32_t                                     GSURBEntryAllocationSize;
+   uint32_t                                     GSNumberofURBEntries;
+};
+
+static inline void
+GEN75_3DSTATE_URB_GS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_3DSTATE_URB_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->GSURBStartingAddress, 25, 30) |
+      __gen_field(values->GSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->GSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_URB_HS: URB allocation for the hull shader stage;
+ * identical bit layout to the other 3DSTATE_URB_* commands. */
+#define GEN75_3DSTATE_URB_HS_length_bias 0x00000002
+#define GEN75_3DSTATE_URB_HS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 49,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_URB_HS_length 0x00000002
+
+struct GEN75_3DSTATE_URB_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     HSURBStartingAddress;
+   uint32_t                                     HSURBEntryAllocationSize;
+   uint32_t                                     HSNumberofURBEntries;
+};
+
+static inline void
+GEN75_3DSTATE_URB_HS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_3DSTATE_URB_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->HSURBStartingAddress, 25, 30) |
+      __gen_field(values->HSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->HSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_VERTEX_BUFFERS header macros (variable-length command) and
+ * VERTEX_BUFFER_STATE: one 4-dword vertex buffer binding (index,
+ * access type, MOCS, pitch, relocated start/end addresses, instance
+ * step rate). */
+#define GEN75_3DSTATE_VERTEX_BUFFERS_length_bias 0x00000002
+#define GEN75_3DSTATE_VERTEX_BUFFERS_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  8
+
+#define GEN75_3DSTATE_VERTEX_BUFFERS_length 0x00000000
+
+#define GEN75_VERTEX_BUFFER_STATE_length 0x00000004
+
+struct GEN75_VERTEX_BUFFER_STATE {
+   uint32_t                                     VertexBufferIndex;
+#define     VERTEXDATA                                         0
+#define     INSTANCEDATA                                       1
+   uint32_t                                     BufferAccessType;
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     VertexBufferMemoryObjectControlState;
+   uint32_t                                     AddressModifyEnable;
+   bool                                         NullVertexBuffer;
+   uint32_t                                     VertexFetchInvalidate;
+   uint32_t                                     BufferPitch;
+   __gen_address_type                           BufferStartingAddress;
+   __gen_address_type                           EndAddress;
+   uint32_t                                     InstanceDataStepRate;
+};
+
+static inline void
+GEN75_VERTEX_BUFFER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_VERTEX_BUFFER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* MOCS is packed separately, then merged into bits 19:16 of dw0. */
+   uint32_t dw_VertexBufferMemoryObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_VertexBufferMemoryObjectControlState, &values->VertexBufferMemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->BufferAccessType, 20, 20) |
+      __gen_field(dw_VertexBufferMemoryObjectControlState, 16, 19) |
+      __gen_field(values->AddressModifyEnable, 14, 14) |
+      __gen_field(values->NullVertexBuffer, 13, 13) |
+      __gen_field(values->VertexFetchInvalidate, 12, 12) |
+      __gen_field(values->BufferPitch, 0, 11) |
+      0;
+
+   /* Start/end addresses are relocated via __gen_combine_address. */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->EndAddress, dw2);
+
+   dw[3] =
+      __gen_field(values->InstanceDataStepRate, 0, 31) |
+      0;
+
+}
+
+struct GEN75_3DSTATE_VERTEX_BUFFERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_3DSTATE_VERTEX_BUFFERS_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN75_3DSTATE_VERTEX_BUFFERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_VERTEX_ELEMENTS (sub-opcode 9).  Variable-length command whose
+ * payload is presumably a list of 2-dword VERTEX_ELEMENT_STATE entries
+ * appended by the caller -- TODO confirm against the Haswell PRM.
+ */
+#define GEN75_3DSTATE_VERTEX_ELEMENTS_length_bias 0x00000002
+#define GEN75_3DSTATE_VERTEX_ELEMENTS_header    \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  9
+
+/* A _length of 0 marks a variable-length command. */
+#define GEN75_3DSTATE_VERTEX_ELEMENTS_length 0x00000000
+
+#define GEN75_VERTEX_ELEMENT_STATE_length 0x00000002
+
+/* One 2-dword vertex-element descriptor: where the element comes from
+ * (buffer index / format / offset) and how each of its four components is
+ * sourced (ComponentNControl). */
+struct GEN75_VERTEX_ELEMENT_STATE {
+   uint32_t                                     VertexBufferIndex;
+   bool                                         Valid;
+   uint32_t                                     SourceElementFormat;
+   bool                                         EdgeFlagEnable;
+   uint32_t                                     SourceElementOffset;
+   uint32_t                                     Component0Control;
+   uint32_t                                     Component1Control;
+   uint32_t                                     Component2Control;
+   uint32_t                                     Component3Control;
+};
+
+/* Packs a VERTEX_ELEMENT_STATE into 2 dwords at dst. */
+static inline void
+GEN75_VERTEX_ELEMENT_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_VERTEX_ELEMENT_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->Valid, 25, 25) |
+      __gen_field(values->SourceElementFormat, 16, 24) |
+      __gen_field(values->EdgeFlagEnable, 15, 15) |
+      __gen_field(values->SourceElementOffset, 0, 11) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Component0Control, 28, 30) |
+      __gen_field(values->Component1Control, 24, 26) |
+      __gen_field(values->Component2Control, 20, 22) |
+      __gen_field(values->Component3Control, 16, 18) |
+      0;
+
+}
+
+/* Fixed one-dword header of 3DSTATE_VERTEX_ELEMENTS; payload appended by
+ * the caller. */
+struct GEN75_3DSTATE_VERTEX_ELEMENTS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Packs only the command header dword; DwordLength covers the payload. */
+static inline void
+GEN75_3DSTATE_VERTEX_ELEMENTS_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN75_3DSTATE_VERTEX_ELEMENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_VF (sub-opcode 12), fixed length: 2 dwords.
+ * IndexedDrawCutIndexEnable (dw[0] bit 8) gates use of CutIndex (all of
+ * dw[1]) -- presumably the primitive-restart index value; TODO confirm
+ * against the Haswell PRM.
+ */
+#define GEN75_3DSTATE_VF_length_bias 0x00000002
+#define GEN75_3DSTATE_VF_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 12,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_VF_length 0x00000002
+
+struct GEN75_3DSTATE_VF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndexedDrawCutIndexEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CutIndex;
+};
+
+/* Packs 3DSTATE_VF into 2 dwords at dst. */
+static inline void
+GEN75_3DSTATE_VF_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_VF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndexedDrawCutIndexEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CutIndex, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VF_STATISTICS (CommandSubType 1, sub-opcode 11).
+ * Single-dword command (note the bias of 1, unlike the 2-dword-bias
+ * commands around it): toggles StatisticsEnable in bit 0. */
+#define GEN75_3DSTATE_VF_STATISTICS_length_bias 0x00000001
+#define GEN75_3DSTATE_VF_STATISTICS_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 11
+
+#define GEN75_3DSTATE_VF_STATISTICS_length 0x00000001
+
+struct GEN75_3DSTATE_VF_STATISTICS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         StatisticsEnable;
+};
+
+/* Packs 3DSTATE_VF_STATISTICS into a single dword at dst. */
+static inline void
+GEN75_3DSTATE_VF_STATISTICS_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN75_3DSTATE_VF_STATISTICS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->StatisticsEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_CC (sub-opcode 35), 2 dwords.
+ * Points the hardware at the color-calculator viewport state.  The pointer
+ * occupies bits 5..31 of dw[1] via __gen_offset -- bits 0..4 are not
+ * encoded, which suggests a 32-byte alignment requirement on the state;
+ * TODO confirm against the Haswell PRM. */
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length_bias 0x00000002
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 35,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length 0x00000002
+
+struct GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CCViewportPointer;
+};
+
+/* Packs the command into 2 dwords at dst. */
+static inline void
+GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC_pack(__gen_user_data *data, void * restrict dst,
+                                              const struct GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_CC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->CCViewportPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP (sub-opcode 33), 2 dwords.
+ * Points the hardware at the combined SF/CLIP viewport state.  The pointer
+ * occupies bits 6..31 of dw[1] -- bits 0..5 are not encoded, suggesting a
+ * 64-byte alignment requirement; TODO confirm against the Haswell PRM. */
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length_bias 0x00000002
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 33,                  \
+   .DwordLength          =  0
+
+#define GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length 0x00000002
+
+struct GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SFClipViewportPointer;
+};
+
+/* Packs the command into 2 dwords at dst. */
+static inline void
+GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                                                   const struct GEN75_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SFClipViewportPointer, 6, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VS (sub-opcode 16), 6 dwords: vertex-shader stage setup --
+ * kernel pointer, thread/sampler/binding-table configuration, scratch
+ * space, URB read layout, and the stage enable bit. */
+#define GEN75_3DSTATE_VS_length_bias 0x00000002
+#define GEN75_3DSTATE_VS_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 16,                  \
+   .DwordLength          =  4
+
+#define GEN75_3DSTATE_VS_length 0x00000006
+
+struct GEN75_3DSTATE_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleVertexDispatch;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         VSaccessesUAV;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBaseOffset;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterforURBData;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+   bool                                         VertexCacheDisable;
+   bool                                         VSFunctionEnable;
+};
+
+/* Packs 3DSTATE_VS into 6 dwords at dst.  KernelStartPointer and
+ * ScratchSpaceBaseOffset go through __gen_offset (low bits not encoded,
+ * implying alignment requirements -- TODO confirm against the PRM). */
+static inline void
+GEN75_3DSTATE_VS_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleVertexDispatch, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->VSaccessesUAV, 12, 12) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBaseOffset, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DispatchGRFStartRegisterforURBData, 20, 24) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 23, 31) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->VertexCacheDisable, 1, 1) |
+      __gen_field(values->VSFunctionEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_WM (sub-opcode 20), 3 dwords: windower/pixel-pipeline control --
+ * depth/stencil interaction, interpolation modes, stipple and
+ * multisample rasterization settings. */
+#define GEN75_3DSTATE_WM_length_bias 0x00000002
+#define GEN75_3DSTATE_WM_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  1
+
+#define GEN75_3DSTATE_WM_length 0x00000003
+
+struct GEN75_3DSTATE_WM {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StatisticsEnable;
+   bool                                         DepthBufferClear;
+   bool                                         ThreadDispatchEnable;
+   bool                                         DepthBufferResolveEnable;
+   bool                                         HierarchicalDepthBufferResolveEnable;
+   bool                                         LegacyDiamondLineRasterization;
+   bool                                         PixelShaderKillPixel;
+#define     PSCDEPTH_OFF                                       0
+#define     PSCDEPTH_ON                                        1
+#define     PSCDEPTH_ON_GE                                     2
+#define     PSCDEPTH_ON_LE                                     3
+   uint32_t                                     PixelShaderComputedDepthMode;
+#define     EDSC_NORMAL                                        0
+#define     EDSC_PSEXEC                                        1
+#define     EDSC_PREPS                                         2
+   uint32_t                                     EarlyDepthStencilControl;
+   bool                                         PixelShaderUsesSourceDepth;
+   bool                                         PixelShaderUsesSourceW;
+#define     INTERP_PIXEL                                       0
+#define     INTERP_CENTROID                                    2
+#define     INTERP_SAMPLE                                      3
+   uint32_t                                     PositionZWInterpolationMode;
+   uint32_t                                     BarycentricInterpolationMode;
+   bool                                         PixelShaderUsesInputCoverageMask;
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   uint32_t                                     LineAntialiasingRegionWidth;
+   bool                                         RTIndependentRasterizationEnable;
+   bool                                         PolygonStippleEnable;
+   bool                                         LineStippleEnable;
+#define     RASTRULE_UPPER_LEFT                                0
+#define     RASTRULE_UPPER_RIGHT                               1
+   uint32_t                                     PointRasterizationRule;
+#define     MSRASTMODE_OFF_PIXEL                               0
+#define     MSRASTMODE_OFF_PATTERN                             1
+#define     MSRASTMODE_ON_PIXEL                                2
+#define     MSRASTMODE_ON_PATTERN                              3
+   uint32_t                                     MultisampleRasterizationMode;
+#define     MSDISPMODE_PERSAMPLE                               0
+#define     MSDISPMODE_PERPIXEL                                1
+   uint32_t                                     MultisampleDispatchMode;
+#define     OFF                                                0
+#define     ON                                                 1
+   uint32_t                                     PSUAVonly;
+};
+
+/* Packs 3DSTATE_WM into 3 dwords at dst.  Note that
+ * MultisampleDispatchMode and PSUAVonly live in dw[2]; everything else
+ * non-header is in dw[1]. */
+static inline void
+GEN75_3DSTATE_WM_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_3DSTATE_WM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StatisticsEnable, 31, 31) |
+      __gen_field(values->DepthBufferClear, 30, 30) |
+      __gen_field(values->ThreadDispatchEnable, 29, 29) |
+      __gen_field(values->DepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->HierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->LegacyDiamondLineRasterization, 26, 26) |
+      __gen_field(values->PixelShaderKillPixel, 25, 25) |
+      __gen_field(values->PixelShaderComputedDepthMode, 23, 24) |
+      __gen_field(values->EarlyDepthStencilControl, 21, 22) |
+      __gen_field(values->PixelShaderUsesSourceDepth, 20, 20) |
+      __gen_field(values->PixelShaderUsesSourceW, 19, 19) |
+      __gen_field(values->PositionZWInterpolationMode, 17, 18) |
+      __gen_field(values->BarycentricInterpolationMode, 11, 16) |
+      __gen_field(values->PixelShaderUsesInputCoverageMask, 10, 10) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 8, 9) |
+      __gen_field(values->LineAntialiasingRegionWidth, 6, 7) |
+      __gen_field(values->RTIndependentRasterizationEnable, 5, 5) |
+      __gen_field(values->PolygonStippleEnable, 4, 4) |
+      __gen_field(values->LineStippleEnable, 3, 3) |
+      __gen_field(values->PointRasterizationRule, 2, 2) |
+      __gen_field(values->MultisampleRasterizationMode, 0, 1) |
+      0;
+
+   dw[2] =
+      __gen_field(values->MultisampleDispatchMode, 31, 31) |
+      __gen_field(values->PSUAVonly, 30, 30) |
+      0;
+
+}
+
+/* GPGPU_OBJECT (media pipeline, opcode 1, sub-opcode 4), 8 dwords:
+ * dispatches a single GPGPU thread group identified by the X/Y/Z thread
+ * group IDs, with optional predication. */
+#define GEN75_GPGPU_OBJECT_length_bias 0x00000002
+#define GEN75_GPGPU_OBJECT_header               \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  6
+
+#define GEN75_GPGPU_OBJECT_length 0x00000008
+
+struct GEN75_GPGPU_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SharedLocalMemoryFixedOffset;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     SharedLocalMemoryOffset;
+   uint32_t                                     EndofThreadGroup;
+#define     Slice0                                             0
+#define     Slice1                                             1
+   uint32_t                                     SliceDestinationSelect;
+#define     HalfSlice1                                         2
+#define     HalfSlice0                                         1
+#define     EitherHalfSlice                                    0
+   uint32_t                                     HalfSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   uint32_t                                     ThreadGroupIDX;
+   uint32_t                                     ThreadGroupIDY;
+   uint32_t                                     ThreadGroupIDZ;
+   uint32_t                                     ExecutionMask;
+};
+
+/* Packs GPGPU_OBJECT into 8 dwords at dst. */
+static inline void
+GEN75_GPGPU_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_GPGPU_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SharedLocalMemoryFixedOffset, 7, 7) |
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SharedLocalMemoryOffset, 28, 31) |
+      __gen_field(values->EndofThreadGroup, 24, 24) |
+      __gen_field(values->SliceDestinationSelect, 19, 19) |
+      __gen_field(values->HalfSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->ThreadGroupIDX, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDY, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ThreadGroupIDZ, 0, 31) |
+      0;
+
+   dw[7] =
+      __gen_field(values->ExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* GPGPU_WALKER (media pipeline, opcode 1, sub-opcode A = 5), 11 dwords
+ * (0x0b): dispatches a full 3-D grid of GPGPU thread groups.  Grid start
+ * and dimension are given per axis; Right/BottomExecutionMask cover the
+ * partial groups at the grid edges. */
+#define GEN75_GPGPU_WALKER_length_bias 0x00000002
+#define GEN75_GPGPU_WALKER_header               \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcodeA           =  5,                  \
+   .DwordLength          =  9
+
+#define GEN75_GPGPU_WALKER_length 0x0000000b
+
+struct GEN75_GPGPU_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcodeA;
+   bool                                         IndirectParameterEnable;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+#define     SIMD8                                              0
+#define     SIMD16                                             1
+#define     SIMD32                                             2
+   uint32_t                                     SIMDSize;
+   uint32_t                                     ThreadDepthCounterMaximum;
+   uint32_t                                     ThreadHeightCounterMaximum;
+   uint32_t                                     ThreadWidthCounterMaximum;
+   uint32_t                                     ThreadGroupIDStartingX;
+   uint32_t                                     ThreadGroupIDXDimension;
+   uint32_t                                     ThreadGroupIDStartingY;
+   uint32_t                                     ThreadGroupIDYDimension;
+   uint32_t                                     ThreadGroupIDStartingZ;
+   uint32_t                                     ThreadGroupIDZDimension;
+   uint32_t                                     RightExecutionMask;
+   uint32_t                                     BottomExecutionMask;
+};
+
+/* Packs GPGPU_WALKER into 11 dwords at dst (dw[0]..dw[10], matching
+ * GEN75_GPGPU_WALKER_length). */
+static inline void
+GEN75_GPGPU_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_GPGPU_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcodeA, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SIMDSize, 30, 31) |
+      __gen_field(values->ThreadDepthCounterMaximum, 16, 21) |
+      __gen_field(values->ThreadHeightCounterMaximum, 8, 13) |
+      __gen_field(values->ThreadWidthCounterMaximum, 0, 5) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ThreadGroupIDStartingX, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->ThreadGroupIDXDimension, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDStartingY, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ThreadGroupIDYDimension, 0, 31) |
+      0;
+
+   dw[7] =
+      __gen_field(values->ThreadGroupIDStartingZ, 0, 31) |
+      0;
+
+   dw[8] =
+      __gen_field(values->ThreadGroupIDZDimension, 0, 31) |
+      0;
+
+   dw[9] =
+      __gen_field(values->RightExecutionMask, 0, 31) |
+      0;
+
+   dw[10] =
+      __gen_field(values->BottomExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_CURBE_LOAD (media pipeline, opcode 0, sub-opcode 1), 4 dwords:
+ * loads constant URB (CURBE) data -- length and start address of the
+ * data block.  Note dw[1] is written as all zeros (reserved/MBZ). */
+#define GEN75_MEDIA_CURBE_LOAD_length_bias 0x00000002
+#define GEN75_MEDIA_CURBE_LOAD_header           \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  1,                  \
+   .DwordLength          =  2
+
+#define GEN75_MEDIA_CURBE_LOAD_length 0x00000004
+
+struct GEN75_MEDIA_CURBE_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CURBETotalDataLength;
+   uint32_t                                     CURBEDataStartAddress;
+};
+
+/* Packs MEDIA_CURBE_LOAD into 4 dwords at dst.  Unlike the 3D commands
+ * above, DwordLength here spans bits 0..15 of the header dword. */
+static inline void
+GEN75_MEDIA_CURBE_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_MEDIA_CURBE_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->CURBETotalDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_field(values->CURBEDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_INTERFACE_DESCRIPTOR_LOAD (media pipeline, opcode 0, sub-opcode 2),
+ * 4 dwords: loads interface descriptors -- total length and start address
+ * of the descriptor data.  dw[1] is written as all zeros (reserved/MBZ). */
+#define GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length_bias 0x00000002
+#define GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD_header\
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          =  2
+
+#define GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length 0x00000004
+
+struct GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorTotalLength;
+   uint32_t                                     InterfaceDescriptorDataStartAddress;
+};
+
+/* Packs the command into 4 dwords at dst.  The start address uses
+ * __gen_offset (whereas MEDIA_CURBE_LOAD uses __gen_field for its
+ * address) -- this mirrors the generator's source description. */
+static inline void
+GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                                           const struct GEN75_MEDIA_INTERFACE_DESCRIPTOR_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->InterfaceDescriptorTotalLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->InterfaceDescriptorDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_OBJECT (media pipeline, opcode 1, sub-opcode 0).  Variable-length
+ * media thread dispatch command; payload is appended by the caller after
+ * the fixed fields below. */
+#define GEN75_MEDIA_OBJECT_length_bias 0x00000002
+#define GEN75_MEDIA_OBJECT_header               \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  0
+
+/* A _length of 0 marks a variable-length command. */
+#define GEN75_MEDIA_OBJECT_length 0x00000000
+
+struct GEN75_MEDIA_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+#define     Slice0                                             0
+#define     Slice1                                             1
+#define     EitherSlice                                        0
+   uint32_t                                     SliceDestinationSelect;
+#define     HalfSlice1                                         2
+#define     HalfSlice0                                         1
+#define     Eitherhalfslice                                    0
+   uint32_t                                     HalfSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   /* NOTE(review): "ScoredboardY" spelling comes from the source
+    * description this header is derived from; renaming it would break
+    * existing users of this struct. */
+   uint32_t                                     ScoredboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   bool                                         ScoreboardMask;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_MEDIA_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_MEDIA_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->SliceDestinationSelect, 19, 19) |
+      __gen_field(values->HalfSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoredboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_MEDIA_OBJECT_PRT_length_bias 0x00000002
+#define GEN75_MEDIA_OBJECT_PRT_header           \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          = 14
+
+#define GEN75_MEDIA_OBJECT_PRT_length 0x00000010
+
+/* MEDIA_OBJECT_PRT: fixed 16-dword command (DwordLength 14 + bias 2) that
+ * spawns a persistent root thread; carries 12 dwords of inline data.
+ */
+struct GEN75_MEDIA_OBJECT_PRT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+   bool                                         PRT_FenceNeeded;
+#define     Rootthreadqueue                                    0
+#define     VFEstateflush                                      1
+   uint32_t                                     PRT_FenceType;
+   uint32_t                                     InlineData[12];
+};
+
+/* Pack MEDIA_OBJECT_PRT: dw0 header, dw1 descriptor offset, dw2 flags,
+ * dw3 reserved, dw4..dw15 the 12 inline data dwords copied verbatim.
+ */
+static inline void
+GEN75_MEDIA_OBJECT_PRT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_MEDIA_OBJECT_PRT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->PRT_FenceNeeded, 23, 23) |
+      __gen_field(values->PRT_FenceType, 22, 22) |
+      0;
+
+   dw[3] =
+      0;
+
+   for (uint32_t i = 0, j = 4; i < 12; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->InlineData[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+#define GEN75_MEDIA_OBJECT_WALKER_length_bias 0x00000002
+#define GEN75_MEDIA_OBJECT_WALKER_header        \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  3
+
+#define GEN75_MEDIA_OBJECT_WALKER_length 0x00000000
+
+/* MEDIA_OBJECT_WALKER: hardware media walker command (variable length:
+ * inline data follows the 17 fixed dwords, hence _length 0).
+ */
+struct GEN75_MEDIA_OBJECT_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   /* Packed into bits 7:0 of dw5.  Was declared bool, which truncated any
+    * multi-bit scoreboard-dependency mask to 0 or 1 before __gen_field saw
+    * it; widened to uint32_t so all eight mask bits can be expressed.
+    * Existing true/false initializers still work unchanged.
+    */
+   uint32_t                                     ScoreboardMask;
+   bool                                         DualMode;
+   bool                                         Repel;
+   bool                                         QuadMode;
+   uint32_t                                     ColorCountMinusOne;
+   uint32_t                                     MiddleLoopExtraSteps;
+   uint32_t                                     LocalMidLoopUnitY;
+   uint32_t                                     MidLoopUnitX;
+   uint32_t                                     GlobalLoopExecCount;
+   uint32_t                                     LocalLoopExecCount;
+   uint32_t                                     BlockResolutionY;
+   uint32_t                                     BlockResolutionX;
+   uint32_t                                     LocalStartY;
+   uint32_t                                     LocalStartX;
+   uint32_t                                     LocalOuterLoopStrideY;
+   uint32_t                                     LocalOuterLoopStrideX;
+   uint32_t                                     LocalInnerLoopUnitY;
+   uint32_t                                     LocalInnerLoopUnitX;
+   uint32_t                                     GlobalResolutionY;
+   uint32_t                                     GlobalResolutionX;
+   uint32_t                                     GlobalStartY;
+   uint32_t                                     GlobalStartX;
+   uint32_t                                     GlobalOuterLoopStrideY;
+   uint32_t                                     GlobalOuterLoopStrideX;
+   uint32_t                                     GlobalInnerLoopUnitY;
+   uint32_t                                     GlobalInnerLoopUnitX;
+   /* variable length fields follow */
+};
+
+/* Pack the 17 fixed dwords of MEDIA_OBJECT_WALKER at dst.  dw4 and dw10 are
+ * reserved (MBZ); the walker loop parameters occupy dw6..dw16.  Caller
+ * appends any inline data after dw16.
+ */
+static inline void
+GEN75_MEDIA_OBJECT_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_MEDIA_OBJECT_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   dw[4] =
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->DualMode, 31, 31) |
+      __gen_field(values->Repel, 30, 30) |
+      __gen_field(values->QuadMode, 29, 29) |
+      __gen_field(values->ColorCountMinusOne, 24, 27) |
+      __gen_field(values->MiddleLoopExtraSteps, 16, 20) |
+      __gen_field(values->LocalMidLoopUnitY, 12, 13) |
+      __gen_field(values->MidLoopUnitX, 8, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->GlobalLoopExecCount, 16, 25) |
+      __gen_field(values->LocalLoopExecCount, 0, 9) |
+      0;
+
+   dw[8] =
+      __gen_field(values->BlockResolutionY, 16, 24) |
+      __gen_field(values->BlockResolutionX, 0, 8) |
+      0;
+
+   dw[9] =
+      __gen_field(values->LocalStartY, 16, 24) |
+      __gen_field(values->LocalStartX, 0, 8) |
+      0;
+
+   dw[10] =
+      0;
+
+   dw[11] =
+      __gen_field(values->LocalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->LocalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[12] =
+      __gen_field(values->LocalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->LocalInnerLoopUnitX, 0, 9) |
+      0;
+
+   dw[13] =
+      __gen_field(values->GlobalResolutionY, 16, 24) |
+      __gen_field(values->GlobalResolutionX, 0, 8) |
+      0;
+
+   dw[14] =
+      __gen_field(values->GlobalStartY, 16, 25) |
+      __gen_field(values->GlobalStartX, 0, 9) |
+      0;
+
+   dw[15] =
+      __gen_field(values->GlobalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->GlobalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[16] =
+      __gen_field(values->GlobalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->GlobalInnerLoopUnitX, 0, 9) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_MEDIA_STATE_FLUSH_length_bias 0x00000002
+#define GEN75_MEDIA_STATE_FLUSH_header          \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  0
+
+#define GEN75_MEDIA_STATE_FLUSH_length 0x00000002
+
+/* MEDIA_STATE_FLUSH: 2-dword command; all control bits live in dw1. */
+struct GEN75_MEDIA_STATE_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         DisablePreemption;
+   bool                                         FlushtoGO;
+   uint32_t                                     WatermarkRequired;
+   uint32_t                                     InterfaceDescriptorOffset;
+};
+
+/* Pack MEDIA_STATE_FLUSH: dw0 header, dw1 preemption/flush/watermark flags
+ * plus the 6-bit interface descriptor offset.
+ */
+static inline void
+GEN75_MEDIA_STATE_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_MEDIA_STATE_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DisablePreemption, 8, 8) |
+      __gen_field(values->FlushtoGO, 7, 7) |
+      __gen_field(values->WatermarkRequired, 6, 6) |
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+}
+
+#define GEN75_MEDIA_VFE_STATE_length_bias 0x00000002
+#define GEN75_MEDIA_VFE_STATE_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  0,                  \
+   .DwordLength          =  6
+
+#define GEN75_MEDIA_VFE_STATE_length 0x00000008
+
+/* MEDIA_VFE_STATE: 8-dword fixed-function video front end setup (thread
+ * limits, scratch space, URB/CURBE allocation, scoreboard configuration).
+ */
+struct GEN75_MEDIA_VFE_STATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     StackSize;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     NumberofURBEntries;
+#define     Maintainingtheexistingtimestampstate               0
+#define     Resettingrelativetimerandlatchingtheglobaltimestamp       1
+   uint32_t                                     ResetGatewayTimer;
+#define     MaintainingOpenGatewayForwardMsgCloseGatewayprotocollegacymode       0
+#define     BypassingOpenGatewayCloseGatewayprotocol           1
+   uint32_t                                     BypassGatewayControl;
+   uint32_t                                     GPGPUMode;
+   uint32_t                                     HalfSliceDisable;
+   uint32_t                                     URBEntryAllocationSize;
+   uint32_t                                     CURBEAllocationSize;
+#define     Scoreboarddisabled                                 0
+#define     Scoreboardenabled                                  1
+   uint32_t                                     ScoreboardEnable;
+#define     StallingScoreboard                                 0
+#define     NonStallingScoreboard                              1
+   uint32_t                                     ScoreboardType;
+   uint32_t                                     ScoreboardMask;
+   uint32_t                                     Scoreboard3DeltaY;
+   uint32_t                                     Scoreboard3DeltaX;
+   uint32_t                                     Scoreboard2DeltaY;
+   uint32_t                                     Scoreboard2DeltaX;
+   uint32_t                                     Scoreboard1DeltaY;
+   uint32_t                                     Scoreboard1DeltaX;
+   uint32_t                                     Scoreboard0DeltaY;
+   uint32_t                                     Scoreboard0DeltaX;
+   uint32_t                                     Scoreboard7DeltaY;
+   uint32_t                                     Scoreboard7DeltaX;
+   uint32_t                                     Scoreboard6DeltaY;
+   uint32_t                                     Scoreboard6DeltaX;
+   uint32_t                                     Scoreboard5DeltaY;
+   uint32_t                                     Scoreboard5DeltaX;
+   uint32_t                                     Scoreboard4DeltaY;
+   uint32_t                                     Scoreboard4DeltaX;
+};
+
+/* Pack MEDIA_VFE_STATE: dw1 scratch base (1KB-aligned, __gen_offset) plus
+ * stack/scratch sizes; dw2 thread/URB/gateway config; dw3 half-slice
+ * disable; dw4 URB/CURBE allocation; dw5 scoreboard enable/type/mask;
+ * dw6/dw7 the eight 4-bit (Y,X) scoreboard dependency deltas.
+ */
+static inline void
+GEN75_MEDIA_VFE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN75_MEDIA_VFE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->StackSize, 4, 7) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->MaximumNumberofThreads, 16, 31) |
+      __gen_field(values->NumberofURBEntries, 8, 15) |
+      __gen_field(values->ResetGatewayTimer, 7, 7) |
+      __gen_field(values->BypassGatewayControl, 6, 6) |
+      __gen_field(values->GPGPUMode, 2, 2) |
+      0;
+
+   dw[3] =
+      __gen_field(values->HalfSliceDisable, 0, 1) |
+      0;
+
+   dw[4] =
+      __gen_field(values->URBEntryAllocationSize, 16, 31) |
+      __gen_field(values->CURBEAllocationSize, 0, 15) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardEnable, 31, 31) |
+      __gen_field(values->ScoreboardType, 30, 30) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->Scoreboard3DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard3DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard2DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard2DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard1DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard1DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard0DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard0DeltaX, 0, 3) |
+      0;
+
+   dw[7] =
+      __gen_field(values->Scoreboard7DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard7DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard6DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard6DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard5DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard5DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard4DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard4DeltaX, 0, 3) |
+      0;
+
+}
+
+#define GEN75_MI_ARB_CHECK_length_bias 0x00000001
+#define GEN75_MI_ARB_CHECK_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  5
+
+#define GEN75_MI_ARB_CHECK_length 0x00000001
+
+/* MI_ARB_CHECK: single-dword MI command; header bits only, no operands. */
+struct GEN75_MI_ARB_CHECK {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+static inline void
+GEN75_MI_ARB_CHECK_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_MI_ARB_CHECK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN75_MI_ARB_ON_OFF_length_bias 0x00000001
+#define GEN75_MI_ARB_ON_OFF_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  8
+
+#define GEN75_MI_ARB_ON_OFF_length 0x00000001
+
+/* MI_ARB_ON_OFF: single-dword MI command toggling arbitration via bit 0. */
+struct GEN75_MI_ARB_ON_OFF {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         ArbitrationEnable;
+};
+
+static inline void
+GEN75_MI_ARB_ON_OFF_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_MI_ARB_ON_OFF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ArbitrationEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN75_MI_BATCH_BUFFER_END_length_bias 0x00000001
+#define GEN75_MI_BATCH_BUFFER_END_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 10
+
+#define GEN75_MI_BATCH_BUFFER_END_length 0x00000001
+
+/* MI_BATCH_BUFFER_END: single-dword terminator for a batch buffer. */
+struct GEN75_MI_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+static inline void
+GEN75_MI_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_MI_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN75_MI_BATCH_BUFFER_START_length_bias 0x00000002
+#define GEN75_MI_BATCH_BUFFER_START_header      \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 49,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_BATCH_BUFFER_START_length 0x00000002
+
+/* MI_BATCH_BUFFER_START: 2-dword chain/call into another batch buffer;
+ * dw0 carries all control flags, dw1 the target address (relocated).
+ */
+struct GEN75_MI_BATCH_BUFFER_START {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     _1stlevelbatch                                     0
+#define     _2ndlevelbatch                                     1
+   uint32_t                                     _2ndLevelBatchBuffer;
+   bool                                         AddOffsetEnable;
+   bool                                         PredicationEnable;
+   uint32_t                                     NonPrivileged;
+   bool                                         ClearCommandBufferEnable;
+   bool                                         ResourceStreamerEnable;
+#define     ASI_GGTT                                           0
+#define     ASI_PPGTT                                          1
+   uint32_t                                     AddressSpaceIndicator;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BatchBufferStartAddress;
+};
+
+static inline void
+GEN75_MI_BATCH_BUFFER_START_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN75_MI_BATCH_BUFFER_START * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->_2ndLevelBatchBuffer, 22, 22) |
+      __gen_field(values->AddOffsetEnable, 16, 16) |
+      __gen_field(values->PredicationEnable, 15, 15) |
+      __gen_field(values->NonPrivileged, 13, 13) |
+      __gen_field(values->ClearCommandBufferEnable, 11, 11) |
+      __gen_field(values->ResourceStreamerEnable, 10, 10) |
+      __gen_field(values->AddressSpaceIndicator, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1 is the relocated batch start address; __gen_combine_address may
+    * record a relocation against `data`. */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BatchBufferStartAddress, dw1);
+
+}
+
+#define GEN75_MI_CLFLUSH_length_bias 0x00000002
+#define GEN75_MI_CLFLUSH_header                 \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 39
+
+#define GEN75_MI_CLFLUSH_length 0x00000000
+
+/* MI_CLFLUSH: variable-length cacheline flush.  dw1 combines the page base
+ * address with a 6-bit starting cacheline offset; a list of cacheline masks
+ * follows the fixed dwords.
+ */
+struct GEN75_MI_CLFLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PageBaseAddress;
+   uint32_t                                     StartingCachelineOffset;
+   __gen_address_type                           PageBaseAddressHigh;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_MI_CLFLUSH_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN75_MI_CLFLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->StartingCachelineOffset, 6, 11) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PageBaseAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->PageBaseAddressHigh, dw2);
+
+   /* variable length fields follow */
+}
+
+#define GEN75_MI_CONDITIONAL_BATCH_BUFFER_END_length_bias 0x00000002
+#define GEN75_MI_CONDITIONAL_BATCH_BUFFER_END_header\
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 54,                  \
+   .UseGlobalGTT         =  0,                  \
+   .CompareSemaphore     =  0,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_CONDITIONAL_BATCH_BUFFER_END_length 0x00000002
+
+/* MI_CONDITIONAL_BATCH_BUFFER_END: ends the batch only when the dword at
+ * CompareAddress satisfies the comparison with CompareDataDword.
+ */
+struct GEN75_MI_CONDITIONAL_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     CompareSemaphore;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CompareDataDword;
+   __gen_address_type                           CompareAddress;
+};
+
+static inline void
+GEN75_MI_CONDITIONAL_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                                           const struct GEN75_MI_CONDITIONAL_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->CompareSemaphore, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CompareDataDword, 0, 31) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->CompareAddress, dw2);
+
+}
+
+#define GEN75_MI_FLUSH_length_bias 0x00000001
+#define GEN75_MI_FLUSH_header                   \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  4
+
+#define GEN75_MI_FLUSH_length 0x00000001
+
+/* MI_FLUSH: single-dword legacy flush; all controls are header bits.
+ * Note the inverted-sense fields: RenderCacheFlushInhibit=1 means DO NOT
+ * flush the render cache (see the Flush/DontFlush defines).
+ */
+struct GEN75_MI_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         GenericMediaStateClear;
+#define     DontReset                                          0
+#define     Reset                                              1
+   bool                                         GlobalSnapshotCountReset;
+#define     Flush                                              0
+#define     DontFlush                                          1
+   bool                                         RenderCacheFlushInhibit;
+#define     DontInvalidate                                     0
+#define     Invalidate                                         1
+   bool                                         StateInstructionCacheInvalidate;
+};
+
+static inline void
+GEN75_MI_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                    const struct GEN75_MI_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IndirectStatePointersDisable, 5, 5) |
+      __gen_field(values->GenericMediaStateClear, 4, 4) |
+      __gen_field(values->GlobalSnapshotCountReset, 3, 3) |
+      __gen_field(values->RenderCacheFlushInhibit, 2, 2) |
+      __gen_field(values->StateInstructionCacheInvalidate, 1, 1) |
+      0;
+
+}
+
+#define GEN75_MI_LOAD_REGISTER_IMM_length_bias 0x00000002
+#define GEN75_MI_LOAD_REGISTER_IMM_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 34,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_LOAD_REGISTER_IMM_length 0x00000003
+
+/* MI_LOAD_REGISTER_IMM: write an immediate dword to an MMIO register.
+ * RegisterOffset is dword-aligned (__gen_offset keeps bits 22:2 in place).
+ */
+struct GEN75_MI_LOAD_REGISTER_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     ByteWriteDisables;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterOffset;
+   uint32_t                                     DataDWord;
+};
+
+static inline void
+GEN75_MI_LOAD_REGISTER_IMM_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_MI_LOAD_REGISTER_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ByteWriteDisables, 8, 11) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterOffset, 2, 22) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord, 0, 31) |
+      0;
+
+}
+
+#define GEN75_MI_LOAD_REGISTER_MEM_length_bias 0x00000002
+#define GEN75_MI_LOAD_REGISTER_MEM_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 41,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_LOAD_REGISTER_MEM_length 0x00000003
+
+/* MI_LOAD_REGISTER_MEM: load an MMIO register from a memory dword; dw2 is
+ * the (relocated) source memory address.
+ */
+struct GEN75_MI_LOAD_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     AsyncModeEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+static inline void
+GEN75_MI_LOAD_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_MI_LOAD_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->AsyncModeEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+#define GEN75_MI_LOAD_REGISTER_REG_length_bias 0x00000002
+#define GEN75_MI_LOAD_REGISTER_REG_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 42,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_LOAD_REGISTER_REG_length 0x00000003
+
+/* MI_LOAD_REGISTER_REG: copy one MMIO register to another (both
+ * dword-aligned offsets, bits 22:2).
+ */
+struct GEN75_MI_LOAD_REGISTER_REG {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SourceRegisterAddress;
+   uint32_t                                     DestinationRegisterAddress;
+};
+
+static inline void
+GEN75_MI_LOAD_REGISTER_REG_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_MI_LOAD_REGISTER_REG * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SourceRegisterAddress, 2, 22) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->DestinationRegisterAddress, 2, 22) |
+      0;
+
+}
+
+#define GEN75_MI_LOAD_SCAN_LINES_EXCL_length_bias 0x00000002
+#define GEN75_MI_LOAD_SCAN_LINES_EXCL_header    \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 19,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_LOAD_SCAN_LINES_EXCL_length 0x00000002
+
+struct GEN75_MI_LOAD_SCAN_LINES_EXCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlaneA                                      0
+#define     DisplayPlaneB                                      1
+#define     DisplayPlaneC                                      4
+   uint32_t                                     DisplayPlaneSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+static inline void
+GEN75_MI_LOAD_SCAN_LINES_EXCL_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN75_MI_LOAD_SCAN_LINES_EXCL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlaneSelect, 19, 21) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StartScanLineNumber, 16, 28) |
+      __gen_field(values->EndScanLineNumber, 0, 12) |
+      0;
+
+}
+
+#define GEN75_MI_LOAD_SCAN_LINES_INCL_length_bias 0x00000002
+#define GEN75_MI_LOAD_SCAN_LINES_INCL_header    \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 18,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_LOAD_SCAN_LINES_INCL_length 0x00000002
+
+struct GEN75_MI_LOAD_SCAN_LINES_INCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlaneA                                      0
+#define     DisplayPlaneB                                      1
+#define     DisplayPlaneC                                      4
+   uint32_t                                     DisplayPlaneSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+static inline void
+GEN75_MI_LOAD_SCAN_LINES_INCL_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN75_MI_LOAD_SCAN_LINES_INCL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlaneSelect, 19, 21) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StartScanLineNumber, 16, 28) |
+      __gen_field(values->EndScanLineNumber, 0, 12) |
+      0;
+
+}
+
+#define GEN75_MI_LOAD_URB_MEM_length_bias 0x00000002
+#define GEN75_MI_LOAD_URB_MEM_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 44,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_LOAD_URB_MEM_length 0x00000003
+
+struct GEN75_MI_LOAD_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+static inline void
+GEN75_MI_LOAD_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN75_MI_LOAD_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+#define GEN75_MI_MATH_length_bias 0x00000002
+#define GEN75_MI_MATH_header                    \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 26
+
+#define GEN75_MI_MATH_length 0x00000000
+
+struct GEN75_MI_MATH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ALUINSTRUCTION1;
+   uint32_t                                     ALUINSTRUCTION2;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_MI_MATH_pack(__gen_user_data *data, void * restrict dst,
+                   const struct GEN75_MI_MATH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ALUINSTRUCTION1, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ALUINSTRUCTION2, 0, 31) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN75_MI_NOOP_length_bias 0x00000001
+#define GEN75_MI_NOOP_header                    \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  0
+
+#define GEN75_MI_NOOP_length 0x00000001
+
+struct GEN75_MI_NOOP {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IdentificationNumberRegisterWriteEnable;
+   uint32_t                                     IdentificationNumber;
+};
+
+static inline void
+GEN75_MI_NOOP_pack(__gen_user_data *data, void * restrict dst,
+                   const struct GEN75_MI_NOOP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IdentificationNumberRegisterWriteEnable, 22, 22) |
+      __gen_field(values->IdentificationNumber, 0, 21) |
+      0;
+
+}
+
+#define GEN75_MI_PREDICATE_length_bias 0x00000001
+#define GEN75_MI_PREDICATE_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 12
+
+#define GEN75_MI_PREDICATE_length 0x00000001
+
+struct GEN75_MI_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     LOAD_KEEP                                          0
+#define     LOAD_LOAD                                          2
+#define     LOAD_LOADINV                                       3
+   uint32_t                                     LoadOperation;
+#define     COMBINE_SET                                        0
+#define     COMBINE_AND                                        1
+#define     COMBINE_OR                                         2
+#define     COMBINE_XOR                                        3
+   uint32_t                                     CombineOperation;
+#define     COMPARE_SRCS_EQUAL                                 2
+#define     COMPARE_DELTAS_EQUAL                               3
+   uint32_t                                     CompareOperation;
+};
+
+static inline void
+GEN75_MI_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_MI_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->LoadOperation, 6, 7) |
+      __gen_field(values->CombineOperation, 3, 4) |
+      __gen_field(values->CompareOperation, 0, 1) |
+      0;
+
+}
+
+#define GEN75_MI_REPORT_HEAD_length_bias 0x00000001
+#define GEN75_MI_REPORT_HEAD_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  7
+
+#define GEN75_MI_REPORT_HEAD_length 0x00000001
+
+struct GEN75_MI_REPORT_HEAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+static inline void
+GEN75_MI_REPORT_HEAD_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_MI_REPORT_HEAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN75_MI_RS_CONTEXT_length_bias 0x00000001
+#define GEN75_MI_RS_CONTEXT_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 15
+
+#define GEN75_MI_RS_CONTEXT_length 0x00000001
+
+struct GEN75_MI_RS_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_RESTORE                                         0
+#define     RS_SAVE                                            1
+   uint32_t                                     ResourceStreamerSave;
+};
+
+static inline void
+GEN75_MI_RS_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_MI_RS_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerSave, 0, 0) |
+      0;
+
+}
+
+#define GEN75_MI_RS_CONTROL_length_bias 0x00000001
+#define GEN75_MI_RS_CONTROL_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  6
+
+#define GEN75_MI_RS_CONTROL_length 0x00000001
+
+struct GEN75_MI_RS_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_STOP                                            0
+#define     RS_START                                           1
+   uint32_t                                     ResourceStreamerControl;
+};
+
+static inline void
+GEN75_MI_RS_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_MI_RS_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerControl, 0, 0) |
+      0;
+
+}
+
+#define GEN75_MI_RS_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN75_MI_RS_STORE_DATA_IMM_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 43,                  \
+   .DwordLength          =  2
+
+#define GEN75_MI_RS_STORE_DATA_IMM_length 0x00000004
+
+struct GEN75_MI_RS_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           DestinationAddress;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+};
+
+static inline void
+GEN75_MI_RS_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_MI_RS_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      0;
+
+   uint32_t dw2 =
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->DestinationAddress, dw2);
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+}
+
+#define GEN75_MI_SEMAPHORE_MBOX_length_bias 0x00000002
+#define GEN75_MI_SEMAPHORE_MBOX_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 22,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_SEMAPHORE_MBOX_length 0x00000003
+
+struct GEN75_MI_SEMAPHORE_MBOX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RVSYNC                                             0
+#define     RVESYNC                                            1
+#define     RBSYNC                                             2
+#define     UseGeneralRegisterSelect                           3
+   uint32_t                                     RegisterSelect;
+   uint32_t                                     GeneralRegisterSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SemaphoreDataDword;
+};
+
+static inline void
+GEN75_MI_SEMAPHORE_MBOX_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_MI_SEMAPHORE_MBOX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->RegisterSelect, 16, 17) |
+      __gen_field(values->GeneralRegisterSelect, 8, 13) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SemaphoreDataDword, 0, 31) |
+      0;
+
+   dw[2] =
+      0;
+
+}
+
+#define GEN75_MI_SET_CONTEXT_length_bias 0x00000002
+#define GEN75_MI_SET_CONTEXT_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 24,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_SET_CONTEXT_length 0x00000002
+
+struct GEN75_MI_SET_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           LogicalContextAddress;
+   uint32_t                                     ReservedMustbe1;
+   bool                                         CoreModeEnable;
+   bool                                         ResourceStreamerStateSaveEnable;
+   bool                                         ResourceStreamerStateRestoreEnable;
+   uint32_t                                     ForceRestore;
+   uint32_t                                     RestoreInhibit;
+};
+
+static inline void
+GEN75_MI_SET_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN75_MI_SET_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->ReservedMustbe1, 8, 8) |
+      __gen_field(values->CoreModeEnable, 4, 4) |
+      __gen_field(values->ResourceStreamerStateSaveEnable, 3, 3) |
+      __gen_field(values->ResourceStreamerStateRestoreEnable, 2, 2) |
+      __gen_field(values->ForceRestore, 1, 1) |
+      __gen_field(values->RestoreInhibit, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->LogicalContextAddress, dw1);
+
+}
+
+#define GEN75_MI_SET_PREDICATE_length_bias 0x00000001
+#define GEN75_MI_SET_PREDICATE_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  1,                  \
+   .PREDICATEENABLE      =  6
+
+#define GEN75_MI_SET_PREDICATE_length 0x00000001
+
+struct GEN75_MI_SET_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PredicateAlways                                    0
+#define     PredicateonClear                                   1
+#define     PredicateonSet                                     2
+#define     PredicateDisable                                   3
+   bool                                         PREDICATEENABLE;
+};
+
+static inline void
+GEN75_MI_SET_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_MI_SET_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->PREDICATEENABLE, 0, 1) |
+      0;
+
+}
+
+#define GEN75_MI_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN75_MI_STORE_DATA_IMM_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 32,                  \
+   .DwordLength          =  2
+
+#define GEN75_MI_STORE_DATA_IMM_length 0x00000004
+
+struct GEN75_MI_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Address;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+static inline void
+GEN75_MI_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_MI_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->Address, 2, 31) |
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+#define GEN75_MI_STORE_DATA_INDEX_length_bias 0x00000002
+#define GEN75_MI_STORE_DATA_INDEX_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 33,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_STORE_DATA_INDEX_length 0x00000003
+
+struct GEN75_MI_STORE_DATA_INDEX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Offset;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+static inline void
+GEN75_MI_STORE_DATA_INDEX_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_MI_STORE_DATA_INDEX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Offset, 2, 11) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+#define GEN75_MI_STORE_URB_MEM_length_bias 0x00000002
+#define GEN75_MI_STORE_URB_MEM_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 45,                  \
+   .DwordLength          =  1
+
+#define GEN75_MI_STORE_URB_MEM_length 0x00000003
+
+struct GEN75_MI_STORE_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+static inline void
+GEN75_MI_STORE_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_MI_STORE_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+#define GEN75_MI_SUSPEND_FLUSH_length_bias 0x00000001
+#define GEN75_MI_SUSPEND_FLUSH_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 11
+
+#define GEN75_MI_SUSPEND_FLUSH_length 0x00000001
+
+struct GEN75_MI_SUSPEND_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         SuspendFlush;
+};
+
+static inline void
+GEN75_MI_SUSPEND_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_MI_SUSPEND_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->SuspendFlush, 0, 0) |
+      0;
+
+}
+
+#define GEN75_MI_TOPOLOGY_FILTER_length_bias 0x00000001
+#define GEN75_MI_TOPOLOGY_FILTER_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 13
+
+#define GEN75_MI_TOPOLOGY_FILTER_length 0x00000001
+
+struct GEN75_MI_TOPOLOGY_FILTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     TopologyFilterValue;
+};
+
+static inline void
+GEN75_MI_TOPOLOGY_FILTER_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN75_MI_TOPOLOGY_FILTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->TopologyFilterValue, 0, 5) |
+      0;
+
+}
+
+#define GEN75_MI_UPDATE_GTT_length_bias 0x00000002
+#define GEN75_MI_UPDATE_GTT_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 35
+
+#define GEN75_MI_UPDATE_GTT_length 0x00000000
+
+struct GEN75_MI_UPDATE_GTT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           EntryAddress;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN75_MI_UPDATE_GTT_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_MI_UPDATE_GTT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->EntryAddress, dw1);
+
+   /* variable length fields follow */
+}
+
+#define GEN75_MI_URB_ATOMIC_ALLOC_length_bias 0x00000001
+#define GEN75_MI_URB_ATOMIC_ALLOC_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  9
+
+#define GEN75_MI_URB_ATOMIC_ALLOC_length 0x00000001
+
+struct GEN75_MI_URB_ATOMIC_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     URBAtomicStorageOffset;
+   uint32_t                                     URBAtomicStorageSize;
+};
+
+static inline void
+GEN75_MI_URB_ATOMIC_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_MI_URB_ATOMIC_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->URBAtomicStorageOffset, 12, 19) |
+      __gen_field(values->URBAtomicStorageSize, 0, 8) |
+      0;
+
+}
+
+#define GEN75_MI_URB_CLEAR_length_bias 0x00000002
+#define GEN75_MI_URB_CLEAR_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 25,                  \
+   .DwordLength          =  0
+
+#define GEN75_MI_URB_CLEAR_length 0x00000002
+
+struct GEN75_MI_URB_CLEAR {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBClearLength;
+   uint32_t                                     URBAddress;
+};
+
+static inline void
+GEN75_MI_URB_CLEAR_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_MI_URB_CLEAR * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBClearLength, 16, 29) |
+      __gen_offset(values->URBAddress, 0, 14) |
+      0;
+
+}
+
+#define GEN75_MI_USER_INTERRUPT_length_bias 0x00000001
+#define GEN75_MI_USER_INTERRUPT_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  2
+
+#define GEN75_MI_USER_INTERRUPT_length 0x00000001
+
+struct GEN75_MI_USER_INTERRUPT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+static inline void
+GEN75_MI_USER_INTERRUPT_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_MI_USER_INTERRUPT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN75_MI_WAIT_FOR_EVENT_length_bias 0x00000001
+#define GEN75_MI_WAIT_FOR_EVENT_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  3
+
+#define GEN75_MI_WAIT_FOR_EVENT_length 0x00000001
+
+struct GEN75_MI_WAIT_FOR_EVENT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         DisplayPipeCHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeCVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteCFlipPendingWaitEnable;
+#define     Notenabled                                         0
+   uint32_t                                     ConditionCodeWaitSelect;
+   bool                                         DisplayPlaneCFlipPendingWaitEnable;
+   bool                                         DisplayPipeCScanLineWaitEnable;
+   bool                                         DisplayPipeBHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeBVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteBFlipPendingWaitEnable;
+   bool                                         DisplayPlaneBFlipPendingWaitEnable;
+   bool                                         DisplayPipeBScanLineWaitEnable;
+   bool                                         DisplayPipeAHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeAVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteAFlipPendingWaitEnable;
+   bool                                         DisplayPlaneAFlipPendingWaitEnable;
+   bool                                         DisplayPipeAScanLineWaitEnable;
+};
+
+static inline void
+GEN75_MI_WAIT_FOR_EVENT_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN75_MI_WAIT_FOR_EVENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPipeCHorizontalBlankWaitEnable, 22, 22) |
+      __gen_field(values->DisplayPipeCVerticalBlankWaitEnable, 21, 21) |
+      __gen_field(values->DisplaySpriteCFlipPendingWaitEnable, 20, 20) |
+      __gen_field(values->ConditionCodeWaitSelect, 16, 19) |
+      __gen_field(values->DisplayPlaneCFlipPendingWaitEnable, 15, 15) |
+      __gen_field(values->DisplayPipeCScanLineWaitEnable, 14, 14) |
+      __gen_field(values->DisplayPipeBHorizontalBlankWaitEnable, 13, 13) |
+      __gen_field(values->DisplayPipeBVerticalBlankWaitEnable, 11, 11) |
+      __gen_field(values->DisplaySpriteBFlipPendingWaitEnable, 10, 10) |
+      __gen_field(values->DisplayPlaneBFlipPendingWaitEnable, 9, 9) |
+      __gen_field(values->DisplayPipeBScanLineWaitEnable, 8, 8) |
+      __gen_field(values->DisplayPipeAHorizontalBlankWaitEnable, 5, 5) |
+      __gen_field(values->DisplayPipeAVerticalBlankWaitEnable, 3, 3) |
+      __gen_field(values->DisplaySpriteAFlipPendingWaitEnable, 2, 2) |
+      __gen_field(values->DisplayPlaneAFlipPendingWaitEnable, 1, 1) |
+      __gen_field(values->DisplayPipeAScanLineWaitEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN75_PIPE_CONTROL_length_bias 0x00000002
+/* Default header field values for PIPE_CONTROL.  DwordLength is the
+ * packet length (5 dwords) minus the length bias (2).
+ */
+#define GEN75_PIPE_CONTROL_header               \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  2,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  3
+
+#define GEN75_PIPE_CONTROL_length 0x00000005
+
+/* PIPE_CONTROL command (Gen7.5/Haswell): cache flush/invalidate, stall
+ * and post-sync-operation controls.  Each #define group preceding a
+ * field lists the legal values for that field.
+ */
+struct GEN75_PIPE_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     DAT_PPGTT                                          0
+#define     DAT_GGTT                                           1
+   uint32_t                                     DestinationAddressType;
+#define     NoLRIOperation                                     0
+#define     MMIOWriteImmediateData                             1
+   uint32_t                                     LRIPostSyncOperation;
+   uint32_t                                     StoreDataIndex;
+   uint32_t                                     CommandStreamerStallEnable;
+#define     DontReset                                          0
+#define     Reset                                              1
+   uint32_t                                     GlobalSnapshotCountReset;
+   uint32_t                                     TLBInvalidate;
+   bool                                         GenericMediaStateClear;
+#define     NoWrite                                            0
+#define     WriteImmediateData                                 1
+#define     WritePSDepthCount                                  2
+#define     WriteTimestamp                                     3
+   uint32_t                                     PostSyncOperation;
+   bool                                         DepthStallEnable;
+#define     DisableFlush                                       0
+#define     EnableFlush                                        1
+   bool                                         RenderTargetCacheFlushEnable;
+   bool                                         InstructionCacheInvalidateEnable;
+   bool                                         TextureCacheInvalidationEnable;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         NotifyEnable;
+   bool                                         PipeControlFlushEnable;
+   bool                                         DCFlushEnable;
+   bool                                         VFCacheInvalidationEnable;
+   bool                                         ConstantCacheInvalidationEnable;
+   bool                                         StateCacheInvalidationEnable;
+   bool                                         StallAtPixelScoreboard;
+#define     FlushDisabled                                      0
+#define     FlushEnabled                                       1
+   bool                                         DepthCacheFlushEnable;
+   /* Post-sync write target; combined into dw2 by the pack function. */
+   __gen_address_type                           Address;
+   uint32_t                                     ImmediateData;
+   /* NOTE(review): "ImmediateData0" is the generator's name-collision
+    * suffix; it appears to be the upper half of the 64-bit post-sync
+    * immediate value — confirm against the PRM.
+    */
+   uint32_t                                     ImmediateData0;
+};
+
+/* Emit a PIPE_CONTROL command into dst (GEN75_PIPE_CONTROL_length = 5
+ * dwords).
+ *
+ * dw0: command header.
+ * dw1: stall / flush / invalidate controls and the post-sync operation.
+ * dw2: post-sync write address (relocated via __gen_combine_address).
+ * dw3: low 32 bits of the post-sync immediate data.
+ * dw4: high 32 bits of the post-sync immediate data.
+ */
+static inline void
+GEN75_PIPE_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_PIPE_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DestinationAddressType, 24, 24) |
+      __gen_field(values->LRIPostSyncOperation, 23, 23) |
+      __gen_field(values->StoreDataIndex, 21, 21) |
+      __gen_field(values->CommandStreamerStallEnable, 20, 20) |
+      __gen_field(values->GlobalSnapshotCountReset, 19, 19) |
+      __gen_field(values->TLBInvalidate, 18, 18) |
+      __gen_field(values->GenericMediaStateClear, 16, 16) |
+      __gen_field(values->PostSyncOperation, 14, 15) |
+      __gen_field(values->DepthStallEnable, 13, 13) |
+      __gen_field(values->RenderTargetCacheFlushEnable, 12, 12) |
+      __gen_field(values->InstructionCacheInvalidateEnable, 11, 11) |
+      __gen_field(values->TextureCacheInvalidationEnable, 10, 10) |
+      __gen_field(values->IndirectStatePointersDisable, 9, 9) |
+      __gen_field(values->NotifyEnable, 8, 8) |
+      __gen_field(values->PipeControlFlushEnable, 7, 7) |
+      __gen_field(values->DCFlushEnable, 5, 5) |
+      __gen_field(values->VFCacheInvalidationEnable, 4, 4) |
+      __gen_field(values->ConstantCacheInvalidationEnable, 3, 3) |
+      __gen_field(values->StateCacheInvalidationEnable, 2, 2) |
+      __gen_field(values->StallAtPixelScoreboard, 1, 1) |
+      __gen_field(values->DepthCacheFlushEnable, 0, 0) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->Address, dw2);
+
+   dw[3] =
+      __gen_field(values->ImmediateData, 0, 31) |
+      0;
+
+   /* Fix: dw4 previously re-packed ImmediateData, duplicating the low
+    * dword and leaving the struct's ImmediateData0 field (the upper half
+    * of the 64-bit immediate) entirely unused.
+    */
+   dw[4] =
+      __gen_field(values->ImmediateData0, 0, 31) |
+      0;
+
+}
+
+#define GEN75_SCISSOR_RECT_length 0x00000002
+
+/* SCISSOR_RECT state entry: 16-bit min/max pixel coordinates of the
+ * scissor rectangle (only the low 16 bits of each field are packed).
+ */
+struct GEN75_SCISSOR_RECT {
+   uint32_t                                     ScissorRectangleYMin;
+   uint32_t                                     ScissorRectangleXMin;
+   uint32_t                                     ScissorRectangleYMax;
+   uint32_t                                     ScissorRectangleXMax;
+};
+
+/* Encode a SCISSOR_RECT state entry into two dwords: the minimum corner
+ * in dword 0 and the maximum corner in dword 1, with Y in bits 31:16 and
+ * X in bits 15:0 of each dword.
+ */
+static inline void
+GEN75_SCISSOR_RECT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN75_SCISSOR_RECT * restrict values)
+{
+   uint32_t * restrict out = dst;
+
+   out[0] = __gen_field(values->ScissorRectangleYMin, 16, 31) |
+            __gen_field(values->ScissorRectangleXMin, 0, 15);
+
+   out[1] = __gen_field(values->ScissorRectangleYMax, 16, 31) |
+            __gen_field(values->ScissorRectangleXMax, 0, 15);
+}
+
+#define GEN75_SF_CLIP_VIEWPORT_length 0x00000010
+
+/* SF_CLIP_VIEWPORT state entry (16 dwords): the viewport transform
+ * matrix elements that are actually stored (scale diagonal m00/m11/m22
+ * and translation row m30/m31/m32), followed by the guardband clip
+ * extents.  The remaining dwords are packed as zero.
+ */
+struct GEN75_SF_CLIP_VIEWPORT {
+   float                                        ViewportMatrixElementm00;
+   float                                        ViewportMatrixElementm11;
+   float                                        ViewportMatrixElementm22;
+   float                                        ViewportMatrixElementm30;
+   float                                        ViewportMatrixElementm31;
+   float                                        ViewportMatrixElementm32;
+   float                                        XMinClipGuardband;
+   float                                        XMaxClipGuardband;
+   float                                        YMinClipGuardband;
+   float                                        YMaxClipGuardband;
+};
+
+/* Encode an SF_CLIP_VIEWPORT state entry (16 dwords): the six used
+ * viewport matrix elements in dw0-5, the guardband extents in dw8-11,
+ * and zeros in the remaining dwords (6-7 and 12-15).
+ */
+static inline void
+GEN75_SF_CLIP_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_SF_CLIP_VIEWPORT * restrict values)
+{
+   uint32_t * restrict out = dst;
+   const float elems[6] = {
+      values->ViewportMatrixElementm00,
+      values->ViewportMatrixElementm11,
+      values->ViewportMatrixElementm22,
+      values->ViewportMatrixElementm30,
+      values->ViewportMatrixElementm31,
+      values->ViewportMatrixElementm32,
+   };
+
+   for (uint32_t i = 0; i < 6; i++)
+      out[i] = __gen_float(elems[i]);
+
+   out[6] = 0;
+   out[7] = 0;
+
+   out[8] = __gen_float(values->XMinClipGuardband);
+   out[9] = __gen_float(values->XMaxClipGuardband);
+   out[10] = __gen_float(values->YMinClipGuardband);
+   out[11] = __gen_float(values->YMaxClipGuardband);
+
+   for (uint32_t i = 12; i < 16; i++)
+      out[i] = 0;
+}
+
+#define GEN75_BLEND_STATE_length 0x00000002
+
+/* BLEND_STATE entry (2 dwords), one per render target: blend equations
+ * and factors in dword 0; write masks, logic op, alpha test, dither and
+ * color-clamp controls in dword 1.  Each #define group preceding a field
+ * lists its legal values; the BLENDFUNCTION_* block is repeated verbatim
+ * before ColorBlendFunction, which is a benign identical macro
+ * redefinition in C.
+ */
+struct GEN75_BLEND_STATE {
+   bool                                         ColorBufferBlendEnable;
+   bool                                         IndependentAlphaBlendEnable;
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+   uint32_t                                     AlphaBlendFunction;
+#define     BLENDFACTOR_ONE                                    1
+#define     BLENDFACTOR_SRC_COLOR                              2
+#define     BLENDFACTOR_SRC_ALPHA                              3
+#define     BLENDFACTOR_DST_ALPHA                              4
+#define     BLENDFACTOR_DST_COLOR                              5
+#define     BLENDFACTOR_SRC_ALPHA_SATURATE                     6
+#define     BLENDFACTOR_CONST_COLOR                            7
+#define     BLENDFACTOR_CONST_ALPHA                            8
+#define     BLENDFACTOR_SRC1_COLOR                             9
+#define     BLENDFACTOR_SRC1_ALPHA                            10
+#define     BLENDFACTOR_ZERO                                  17
+#define     BLENDFACTOR_INV_SRC_COLOR                         18
+#define     BLENDFACTOR_INV_SRC_ALPHA                         19
+#define     BLENDFACTOR_INV_DST_ALPHA                         20
+#define     BLENDFACTOR_INV_DST_COLOR                         21
+#define     BLENDFACTOR_INV_CONST_COLOR                       23
+#define     BLENDFACTOR_INV_CONST_ALPHA                       24
+#define     BLENDFACTOR_INV_SRC1_COLOR                        25
+#define     BLENDFACTOR_INV_SRC1_ALPHA                        26
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+   uint32_t                                     ColorBlendFunction;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   bool                                         AlphaToCoverageEnable;
+   bool                                         AlphaToOneEnable;
+   bool                                         AlphaToCoverageDitherEnable;
+   bool                                         WriteDisableAlpha;
+   bool                                         WriteDisableRed;
+   bool                                         WriteDisableGreen;
+   bool                                         WriteDisableBlue;
+   bool                                         LogicOpEnable;
+#define     LOGICOP_CLEAR                                      0
+#define     LOGICOP_NOR                                        1
+#define     LOGICOP_AND_INVERTED                               2
+#define     LOGICOP_COPY_INVERTED                              3
+#define     LOGICOP_AND_REVERSE                                4
+#define     LOGICOP_INVERT                                     5
+#define     LOGICOP_XOR                                        6
+#define     LOGICOP_NAND                                       7
+#define     LOGICOP_AND                                        8
+#define     LOGICOP_EQUIV                                      9
+#define     LOGICOP_NOOP                                      10
+#define     LOGICOP_OR_INVERTED                               11
+#define     LOGICOP_COPY                                      12
+#define     LOGICOP_OR_REVERSE                                13
+#define     LOGICOP_OR                                        14
+#define     LOGICOP_SET                                       15
+   uint32_t                                     LogicOpFunction;
+   bool                                         AlphaTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     AlphaTestFunction;
+   bool                                         ColorDitherEnable;
+   uint32_t                                     XDitherOffset;
+   uint32_t                                     YDitherOffset;
+#define     COLORCLAMP_UNORM                                   0
+#define     COLORCLAMP_SNORM                                   1
+#define     COLORCLAMP_RTFORMAT                                2
+   uint32_t                                     ColorClampRange;
+   bool                                         PreBlendColorClampEnable;
+   bool                                         PostBlendColorClampEnable;
+};
+
+/* Encode a BLEND_STATE entry (2 dwords).  Dword 0 carries the blend
+ * enables, equations and factors; dword 1 carries the write masks,
+ * logic op, alpha test, dither and color-clamp controls.
+ */
+static inline void
+GEN75_BLEND_STATE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN75_BLEND_STATE * restrict values)
+{
+   uint32_t * restrict out = dst;
+   uint32_t v;
+
+   v  = __gen_field(values->ColorBufferBlendEnable, 31, 31);
+   v |= __gen_field(values->IndependentAlphaBlendEnable, 30, 30);
+   v |= __gen_field(values->AlphaBlendFunction, 26, 28);
+   v |= __gen_field(values->SourceAlphaBlendFactor, 20, 24);
+   v |= __gen_field(values->DestinationAlphaBlendFactor, 15, 19);
+   v |= __gen_field(values->ColorBlendFunction, 11, 13);
+   v |= __gen_field(values->SourceBlendFactor, 5, 9);
+   v |= __gen_field(values->DestinationBlendFactor, 0, 4);
+   out[0] = v;
+
+   v  = __gen_field(values->AlphaToCoverageEnable, 31, 31);
+   v |= __gen_field(values->AlphaToOneEnable, 30, 30);
+   v |= __gen_field(values->AlphaToCoverageDitherEnable, 29, 29);
+   v |= __gen_field(values->WriteDisableAlpha, 27, 27);
+   v |= __gen_field(values->WriteDisableRed, 26, 26);
+   v |= __gen_field(values->WriteDisableGreen, 25, 25);
+   v |= __gen_field(values->WriteDisableBlue, 24, 24);
+   v |= __gen_field(values->LogicOpEnable, 22, 22);
+   v |= __gen_field(values->LogicOpFunction, 18, 21);
+   v |= __gen_field(values->AlphaTestEnable, 16, 16);
+   v |= __gen_field(values->AlphaTestFunction, 13, 15);
+   v |= __gen_field(values->ColorDitherEnable, 12, 12);
+   v |= __gen_field(values->XDitherOffset, 10, 11);
+   v |= __gen_field(values->YDitherOffset, 8, 9);
+   v |= __gen_field(values->ColorClampRange, 2, 3);
+   v |= __gen_field(values->PreBlendColorClampEnable, 1, 1);
+   v |= __gen_field(values->PostBlendColorClampEnable, 0, 0);
+   out[1] = v;
+}
+
+#define GEN75_CC_VIEWPORT_length 0x00000002
+
+/* CC_VIEWPORT state entry: the min/max depth range used by the color
+ * calculator, stored as raw 32-bit floats.
+ */
+struct GEN75_CC_VIEWPORT {
+   float                                        MinimumDepth;
+   float                                        MaximumDepth;
+};
+
+/* Encode a CC_VIEWPORT entry: minimum depth in dword 0 and maximum
+ * depth in dword 1, each as the raw bit pattern of a 32-bit float.
+ */
+static inline void
+GEN75_CC_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN75_CC_VIEWPORT * restrict values)
+{
+   uint32_t * restrict out = dst;
+
+   out[0] = __gen_float(values->MinimumDepth);
+   out[1] = __gen_float(values->MaximumDepth);
+}
+
+#define GEN75_COLOR_CALC_STATE_length 0x00000006
+
+/* COLOR_CALC_STATE (6 dwords): stencil reference values, the alpha-test
+ * reference (in one of two encodings, selected by AlphaTestFormat) and
+ * the blend constant color.
+ */
+struct GEN75_COLOR_CALC_STATE {
+   uint32_t                                     StencilReferenceValue;
+   uint32_t                                     BackFaceStencilReferenceValue;
+#define     Cancelled                                          0
+#define     NotCancelled                                       1
+   uint32_t                                     RoundDisableFunctionDisable;
+#define     ALPHATEST_UNORM8                                   0
+#define     ALPHATEST_FLOAT32                                  1
+   uint32_t                                     AlphaTestFormat;
+   /* The two alpha reference encodings are ORed into the same dword by
+    * the pack function, so only the one matching AlphaTestFormat should
+    * be non-zero.
+    */
+   uint32_t                                     AlphaReferenceValueAsUNORM8;
+   float                                        AlphaReferenceValueAsFLOAT32;
+   float                                        BlendConstantColorRed;
+   float                                        BlendConstantColorGreen;
+   float                                        BlendConstantColorBlue;
+   float                                        BlendConstantColorAlpha;
+};
+
+/* Encode COLOR_CALC_STATE (6 dwords).  Dword 1 overlays the UNORM8 and
+ * FLOAT32 alpha-reference encodings (both are ORed in); the caller is
+ * expected to populate only the one selected by AlphaTestFormat.
+ */
+static inline void
+GEN75_COLOR_CALC_STATE_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN75_COLOR_CALC_STATE * restrict values)
+{
+   uint32_t * restrict out = dst;
+
+   out[0] = __gen_field(values->StencilReferenceValue, 24, 31) |
+            __gen_field(values->BackFaceStencilReferenceValue, 16, 23) |
+            __gen_field(values->RoundDisableFunctionDisable, 15, 15) |
+            __gen_field(values->AlphaTestFormat, 0, 0);
+
+   out[1] = __gen_field(values->AlphaReferenceValueAsUNORM8, 0, 31) |
+            __gen_float(values->AlphaReferenceValueAsFLOAT32);
+
+   out[2] = __gen_float(values->BlendConstantColorRed);
+   out[3] = __gen_float(values->BlendConstantColorGreen);
+   out[4] = __gen_float(values->BlendConstantColorBlue);
+   out[5] = __gen_float(values->BlendConstantColorAlpha);
+}
+
+#define GEN75_DEPTH_STENCIL_STATE_length 0x00000003
+
+/* DEPTH_STENCIL_STATE (3 dwords): front/back-face stencil controls,
+ * stencil masks and depth-test configuration.  The COMPAREFUNCTION_*
+ * and STENCILOP_* groups are repeated before each field that accepts
+ * them; identical macro redefinition is legal in C.
+ */
+struct GEN75_DEPTH_STENCIL_STATE {
+   bool                                         StencilTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     StencilTestFunction;
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+   uint32_t                                     StencilFailOp;
+   uint32_t                                     StencilPassDepthFailOp;
+   uint32_t                                     StencilPassDepthPassOp;
+   bool                                         StencilBufferWriteEnable;
+   bool                                         DoubleSidedStencilEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     BackFaceStencilTestFunction;
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+   uint32_t                                     BackfaceStencilFailOp;
+   uint32_t                                     BackfaceStencilPassDepthFailOp;
+   uint32_t                                     BackfaceStencilPassDepthPassOp;
+   uint32_t                                     StencilTestMask;
+   uint32_t                                     StencilWriteMask;
+   uint32_t                                     BackfaceStencilTestMask;
+   uint32_t                                     BackfaceStencilWriteMask;
+   bool                                         DepthTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     DepthTestFunction;
+   bool                                         DepthBufferWriteEnable;
+};
+
+/* Encode DEPTH_STENCIL_STATE (3 dwords): stencil ops in dw0, stencil
+ * masks in dw1 and depth-test controls in dw2.
+ */
+static inline void
+GEN75_DEPTH_STENCIL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_DEPTH_STENCIL_STATE * restrict values)
+{
+   uint32_t * restrict out = dst;
+   uint32_t v;
+
+   v  = __gen_field(values->StencilTestEnable, 31, 31);
+   v |= __gen_field(values->StencilTestFunction, 28, 30);
+   v |= __gen_field(values->StencilFailOp, 25, 27);
+   v |= __gen_field(values->StencilPassDepthFailOp, 22, 24);
+   v |= __gen_field(values->StencilPassDepthPassOp, 19, 21);
+   v |= __gen_field(values->StencilBufferWriteEnable, 18, 18);
+   v |= __gen_field(values->DoubleSidedStencilEnable, 15, 15);
+   v |= __gen_field(values->BackFaceStencilTestFunction, 12, 14);
+   v |= __gen_field(values->BackfaceStencilFailOp, 9, 11);
+   v |= __gen_field(values->BackfaceStencilPassDepthFailOp, 6, 8);
+   v |= __gen_field(values->BackfaceStencilPassDepthPassOp, 3, 5);
+   out[0] = v;
+
+   v  = __gen_field(values->StencilTestMask, 24, 31);
+   v |= __gen_field(values->StencilWriteMask, 16, 23);
+   v |= __gen_field(values->BackfaceStencilTestMask, 8, 15);
+   v |= __gen_field(values->BackfaceStencilWriteMask, 0, 7);
+   out[1] = v;
+
+   v  = __gen_field(values->DepthTestEnable, 31, 31);
+   v |= __gen_field(values->DepthTestFunction, 27, 29);
+   v |= __gen_field(values->DepthBufferWriteEnable, 26, 26);
+   out[2] = v;
+}
+
+#define GEN75_INTERFACE_DESCRIPTOR_DATA_length 0x00000008
+
+/* INTERFACE_DESCRIPTOR_DATA (8 dwords): per-kernel dispatch descriptor
+ * for media/GPGPU — kernel entry point, sampler and binding-table
+ * pointers, and thread-group execution controls.
+ */
+struct GEN75_INTERFACE_DESCRIPTOR_DATA {
+   uint32_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     SamplerStatePointer;
+#define     Nosamplersused                                     0
+#define     Between1and4samplersused                           1
+#define     Between5and8samplersused                           2
+#define     Between9and12samplersused                          3
+#define     Between13and16samplersused                         4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTablePointer;
+   uint32_t                                     BindingTableEntryCount;
+   uint32_t                                     ConstantURBEntryReadLength;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         BarrierEnable;
+   uint32_t                                     SharedLocalMemorySize;
+   uint32_t                                     NumberofThreadsinGPGPUThreadGroup;
+   uint32_t                                     CrossThreadConstantDataReadLength;
+};
+
+/* Encode INTERFACE_DESCRIPTOR_DATA (8 dwords); the final dword is
+ * reserved and packed as zero.
+ */
+static inline void
+GEN75_INTERFACE_DESCRIPTOR_DATA_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN75_INTERFACE_DESCRIPTOR_DATA * restrict values)
+{
+   uint32_t * restrict out = dst;
+
+   out[0] = __gen_offset(values->KernelStartPointer, 6, 31);
+
+   out[1] = __gen_field(values->SingleProgramFlow, 18, 18) |
+            __gen_field(values->ThreadPriority, 17, 17) |
+            __gen_field(values->FloatingPointMode, 16, 16) |
+            __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+            __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+            __gen_field(values->SoftwareExceptionEnable, 7, 7);
+
+   out[2] = __gen_offset(values->SamplerStatePointer, 5, 31) |
+            __gen_field(values->SamplerCount, 2, 4);
+
+   out[3] = __gen_offset(values->BindingTablePointer, 5, 15) |
+            __gen_field(values->BindingTableEntryCount, 0, 4);
+
+   out[4] = __gen_field(values->ConstantURBEntryReadLength, 16, 31);
+
+   out[5] = __gen_field(values->RoundingMode, 22, 23) |
+            __gen_field(values->BarrierEnable, 21, 21) |
+            __gen_field(values->SharedLocalMemorySize, 16, 20) |
+            __gen_field(values->NumberofThreadsinGPGPUThreadGroup, 0, 7);
+
+   out[6] = __gen_field(values->CrossThreadConstantDataReadLength, 0, 7);
+
+   out[7] = 0;
+}
+
+#define GEN75_BINDING_TABLE_STATE_length 0x00000001
+
+/* BINDING_TABLE_STATE: a single pointer to a SURFACE_STATE entry; only
+ * bits 31:5 are stored, implying 32-byte alignment.
+ */
+struct GEN75_BINDING_TABLE_STATE {
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Encode a BINDING_TABLE_STATE entry: the surface-state pointer's
+ * bits 31:5 into a single dword.
+ */
+static inline void
+GEN75_BINDING_TABLE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN75_BINDING_TABLE_STATE * restrict values)
+{
+   uint32_t * restrict out = dst;
+
+   out[0] = __gen_offset(values->SurfaceStatePointer, 5, 31);
+}
+
+#define GEN75_RENDER_SURFACE_STATE_length 0x00000008
+
+/* RENDER_SURFACE_STATE (8 dwords): describes a texture / render-target /
+ * buffer surface — type, format, layout, dimensions, multisampling, MCS
+ * auxiliary surface and shader channel swizzles.  Each #define group
+ * preceding a field lists its legal values.
+ */
+struct GEN75_RENDER_SURFACE_STATE {
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_BUFFER                                    4
+#define     SURFTYPE_STRBUF                                    5
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         SurfaceArray;
+   uint32_t                                     SurfaceFormat;
+#define     VALIGN_2                                           0
+#define     VALIGN_4                                           1
+   uint32_t                                     SurfaceVerticalAlignment;
+#define     HALIGN_4                                           0
+#define     HALIGN_8                                           1
+   uint32_t                                     SurfaceHorizontalAlignment;
+   uint32_t                                     TiledSurface;
+#define     TILEWALK_XMAJOR                                    0
+#define     TILEWALK_YMAJOR                                    1
+   uint32_t                                     TileWalk;
+   uint32_t                                     VerticalLineStride;
+   uint32_t                                     VerticalLineStrideOffset;
+#define     ARYSPC_FULL                                        0
+#define     ARYSPC_LOD0                                        1
+   uint32_t                                     SurfaceArraySpacing;
+   uint32_t                                     RenderCacheReadWriteMode;
+#define     NORMAL_MODE                                        0
+#define     PROGRESSIVE_FRAME                                  2
+#define     INTERLACED_FRAME                                   3
+   uint32_t                                     MediaBoundaryPixelMode;
+   uint32_t                                     CubeFaceEnables;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     Depth;
+   uint32_t                                     IntegerSurfaceFormat;
+   uint32_t                                     SurfacePitch;
+#define     RTROTATE_0DEG                                      0
+#define     RTROTATE_90DEG                                     1
+#define     RTROTATE_270DEG                                    3
+   uint32_t                                     RenderTargetRotation;
+   uint32_t                                     MinimumArrayElement;
+   uint32_t                                     RenderTargetViewExtent;
+#define     MSFMT_MSS                                          0
+#define     MSFMT_DEPTH_STENCIL                                1
+   uint32_t                                     MultisampledSurfaceStorageFormat;
+#define     MULTISAMPLECOUNT_1                                 0
+#define     MULTISAMPLECOUNT_4                                 2
+#define     MULTISAMPLECOUNT_8                                 3
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     MultisamplePositionPaletteIndex;
+   /* NOTE(review): generator name-collision duplicate of
+    * MinimumArrayElement; the pack function never reads this field and
+    * re-packs MinimumArrayElement instead — confirm intent against the
+    * PRM.
+    */
+   uint32_t                                     MinimumArrayElement0;
+   uint32_t                                     XOffset;
+   uint32_t                                     YOffset;
+   /* Cacheability/MOCS controls; struct is defined earlier in this header. */
+   struct GEN75_MEMORY_OBJECT_CONTROL_STATE     SurfaceObjectControlState;
+   uint32_t                                     SurfaceMinLOD;
+   uint32_t                                     MIPCountLOD;
+   __gen_address_type                           MCSBaseAddress;
+   uint32_t                                     MCSSurfacePitch;
+   __gen_address_type                           AppendCounterAddress;
+   bool                                         AppendCounterEnable;
+   bool                                         MCSEnable;
+   uint32_t                                     XOffsetforUVPlane;
+   uint32_t                                     YOffsetforUVPlane;
+#define     SCS_ZERO                                           0
+#define     SCS_ONE                                            1
+#define     SCS_RED                                            4
+#define     SCS_GREEN                                          5
+#define     SCS_BLUE                                           6
+#define     SCS_ALPHA                                          7
+   uint32_t                                     ShaderChannelSelectR;
+   uint32_t                                     ShaderChannelSelectG;
+   uint32_t                                     ShaderChannelSelectB;
+   uint32_t                                     ShaderChannelSelectA;
+   float                                        ResourceMinLOD;
+};
+
+/* Pack a GEN75_RENDER_SURFACE_STATE into the 8 dwords at `dst`.
+ *
+ * Generated-pack convention used throughout this file: fields whose bit
+ * ranges overlap are hardware-layout variants (unions) and are simply OR'd
+ * together, so the caller must leave the variants it does not use zeroed
+ * (e.g. the MCS fields vs. the planar UV offsets in dword 6).
+ */
+static inline void
+GEN75_RENDER_SURFACE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN75_RENDER_SURFACE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->SurfaceArray, 28, 28) |
+      __gen_field(values->SurfaceFormat, 18, 26) |
+      __gen_field(values->SurfaceVerticalAlignment, 16, 17) |
+      __gen_field(values->SurfaceHorizontalAlignment, 15, 15) |
+      __gen_field(values->TiledSurface, 14, 14) |
+      __gen_field(values->TileWalk, 13, 13) |
+      __gen_field(values->VerticalLineStride, 12, 12) |
+      __gen_field(values->VerticalLineStrideOffset, 11, 11) |
+      __gen_field(values->SurfaceArraySpacing, 10, 10) |
+      __gen_field(values->RenderCacheReadWriteMode, 8, 8) |
+      __gen_field(values->MediaBoundaryPixelMode, 6, 7) |
+      __gen_field(values->CubeFaceEnables, 0, 5) |
+      0;
+
+   /* Dword 1 is the surface base address; __gen_combine_address merges the
+    * relocation with any extra low bits (none here). */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->SurfaceBaseAddress, dw1);
+
+   dw[2] =
+      __gen_field(values->Height, 16, 29) |
+      __gen_field(values->Width, 0, 13) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->IntegerSurfaceFormat, 18, 20) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   dw[4] =
+      __gen_field(values->RenderTargetRotation, 29, 30) |
+      __gen_field(values->MinimumArrayElement, 18, 28) |
+      __gen_field(values->RenderTargetViewExtent, 7, 17) |
+      __gen_field(values->MultisampledSurfaceStorageFormat, 6, 6) |
+      __gen_field(values->NumberofMultisamples, 3, 5) |
+      __gen_field(values->MultisamplePositionPaletteIndex, 0, 2) |
+      /* Second layout variant of dword 4 (bits 0..26): use the separately
+       * declared MinimumArrayElement0.  The original OR'd MinimumArrayElement
+       * in a second time, which bled the bits-18..28 copy into this variant
+       * and left MinimumArrayElement0 dead.  Confirm against the HSW PRM. */
+      __gen_field(values->MinimumArrayElement0, 0, 26) |
+      0;
+
+   uint32_t dw_SurfaceObjectControlState;
+   GEN75_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceObjectControlState, &values->SurfaceObjectControlState);
+   dw[5] =
+      __gen_offset(values->XOffset, 25, 31) |
+      __gen_offset(values->YOffset, 20, 23) |
+      __gen_field(dw_SurfaceObjectControlState, 16, 19) |
+      __gen_field(values->SurfaceMinLOD, 4, 7) |
+      __gen_field(values->MIPCountLOD, 0, 3) |
+      0;
+
+   /* Dword 6 unions the MCS/append-counter layout with the planar UV-offset
+    * layout; AppendCounterAddress is combined on top of those bits. */
+   uint32_t dw6 =
+      __gen_field(values->MCSSurfacePitch, 3, 11) |
+      __gen_field(values->AppendCounterEnable, 1, 1) |
+      __gen_field(values->MCSEnable, 0, 0) |
+      __gen_field(values->XOffsetforUVPlane, 16, 29) |
+      __gen_field(values->YOffsetforUVPlane, 0, 13) |
+      0;
+
+   dw[6] =
+      __gen_combine_address(data, &dw[6], values->AppendCounterAddress, dw6);
+
+   dw[7] =
+      __gen_field(values->ShaderChannelSelectR, 25, 27) |
+      __gen_field(values->ShaderChannelSelectG, 22, 24) |
+      __gen_field(values->ShaderChannelSelectB, 19, 21) |
+      __gen_field(values->ShaderChannelSelectA, 16, 18) |
+      /* ResourceMinLOD is converted to fixed point with 8 fractional bits. */
+      __gen_field(values->ResourceMinLOD * (1 << 8), 0, 11) |
+      0;
+
+}
+
+/* Lengths are in dwords: SAMPLER_BORDER_COLOR_STATE is 0x14 = 20 dwords,
+ * the UINT32/SINT32 border-color block is 4 dwords. */
+#define GEN75_SAMPLER_BORDER_COLOR_STATE_length 0x00000014
+
+#define GEN75_BORDER_COLOR_UINT32_SINT32_length 0x00000004
+
+/* Border color in 32-bit-per-channel integer form.  Each channel carries
+ * both an unsigned (ui32) and a signed (si32) view; the pack routine ORs
+ * the pair into the same dword, so only the variant in use should be set.
+ * NOTE(review): the "...unclamp0" green pair is a second green occurrence
+ * in the layout (between blue and alpha) — confirm against the HSW PRM. */
+struct GEN75_BORDER_COLOR_UINT32_SINT32 {
+   uint32_t                                     BorderColorRedui32integerunclamp;
+   uint32_t                                     BorderColorRedsi32integerunclamp;
+   uint32_t                                     BorderColorGreenui32integerunclamp;
+   uint32_t                                     BorderColorGreensi32integerunclamp;
+   uint32_t                                     BorderColorBlueui32integerunclamp;
+   uint32_t                                     BorderColorBluesi32integerunclamp;
+   uint32_t                                     BorderColorGreenui32integerunclamp0;
+   uint32_t                                     BorderColorGreensi32integerunclamp0;
+   uint32_t                                     BorderColorAlphaui32integerunclamp;
+   uint32_t                                     BorderColorAlphasi32integerunclamp;
+};
+
+/* Pack GEN75_BORDER_COLOR_UINT32_SINT32 (4 dwords): red in dw0, green in
+ * dw1, blue plus the second green occurrence in dw2, alpha in dw3.  Each
+ * channel's uint and sint views are OR'd into the same dword — only one
+ * of the pair should be nonzero. */
+static inline void
+GEN75_BORDER_COLOR_UINT32_SINT32_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_BORDER_COLOR_UINT32_SINT32 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BorderColorRedui32integerunclamp, 0, 31) |
+      __gen_field(values->BorderColorRedsi32integerunclamp, 0, 31) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BorderColorGreenui32integerunclamp, 0, 31) |
+      __gen_field(values->BorderColorGreensi32integerunclamp, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->BorderColorBlueui32integerunclamp, 0, 31) |
+      __gen_field(values->BorderColorBluesi32integerunclamp, 0, 31) |
+      /* Fix: dword 2's second green occurrence must read the "...unclamp0"
+       * fields the struct declares for exactly this slot.  The original
+       * re-read the dword-1 green fields, so any nonzero green also
+       * corrupted the blue dword, and the ...0 fields were never used. */
+      __gen_field(values->BorderColorGreenui32integerunclamp0, 0, 31) |
+      __gen_field(values->BorderColorGreensi32integerunclamp0, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->BorderColorAlphaui32integerunclamp, 0, 31) |
+      __gen_field(values->BorderColorAlphasi32integerunclamp, 0, 31) |
+      0;
+
+}
+
+#define GEN75_BORDER_COLOR_UINT16_SINT16_length 0x00000004
+
+/* Border color clamped to 16 bits per channel.  As with the 32-bit form,
+ * each channel has both a uint and a sint view that the pack routine ORs
+ * into the same 16-bit slot; set only the variant in use. */
+struct GEN75_BORDER_COLOR_UINT16_SINT16 {
+   uint32_t                                     BorderColorGreenclamptouint16;
+   uint32_t                                     BorderColorGreenclamptosint16;
+   uint32_t                                     BorderColorRedclamptouint16;
+   uint32_t                                     BorderColorRedclamptosint16;
+   uint32_t                                     BorderColorAlphaclamptouint16;
+   uint32_t                                     BorderColorAlphaclamptosint16;
+   uint32_t                                     BorderColorBlueclamptouint16;
+   uint32_t                                     BorderColorBlueclamptosint16;
+};
+
+/* Pack GEN75_BORDER_COLOR_UINT16_SINT16 (4 dwords): dw0 = green(31:16) |
+ * red(15:0), dw2 = alpha(31:16) | blue(15:0); dw1 and dw3 are zero
+ * padding.  The uint and sint views of each channel are OR'd into the
+ * same bits — only one of the pair should be nonzero. */
+static inline void
+GEN75_BORDER_COLOR_UINT16_SINT16_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_BORDER_COLOR_UINT16_SINT16 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BorderColorGreenclamptouint16, 16, 31) |
+      __gen_field(values->BorderColorGreenclamptosint16, 16, 31) |
+      __gen_field(values->BorderColorRedclamptouint16, 0, 15) |
+      __gen_field(values->BorderColorRedclamptosint16, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->BorderColorAlphaclamptouint16, 16, 31) |
+      __gen_field(values->BorderColorAlphaclamptosint16, 16, 31) |
+      __gen_field(values->BorderColorBlueclamptouint16, 0, 15) |
+      __gen_field(values->BorderColorBlueclamptosint16, 0, 15) |
+      0;
+
+   dw[3] =
+      0;
+
+}
+
+#define GEN75_BORDER_COLOR_UINT8_SINT8_length 0x00000004
+
+/* Border color clamped to 8 bits per channel; uint and sint views of a
+ * channel are OR'd into the same byte by the pack routine.
+ * NOTE(review): the last pair is named "BorderRedAlpha..." but is packed
+ * into the red byte (bits 7:0) — looks like a generator naming glitch for
+ * "BorderColorRed..."; confirm before renaming. */
+struct GEN75_BORDER_COLOR_UINT8_SINT8 {
+   uint32_t                                     BorderColorAlphaclamptouint8;
+   uint32_t                                     BorderColorAlphaclamptosint8;
+   uint32_t                                     BorderColorBlueclamptouint8;
+   uint32_t                                     BorderColorBlueclamptosint8;
+   uint32_t                                     BorderColorGreenclamptouint8;
+   uint32_t                                     BorderColorGreenclamptosint8;
+   uint32_t                                     BorderRedAlphaclamptouint8;
+   uint32_t                                     BorderRedAlphaclamptosint8;
+};
+
+/* Pack GEN75_BORDER_COLOR_UINT8_SINT8 (4 dwords): dw0 = A(31:24) |
+ * B(23:16) | G(15:8) | R(7:0); dwords 1..3 are zero padding.  The uint
+ * and sint views of each channel share a byte — set only one. */
+static inline void
+GEN75_BORDER_COLOR_UINT8_SINT8_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN75_BORDER_COLOR_UINT8_SINT8 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BorderColorAlphaclamptouint8, 24, 31) |
+      __gen_field(values->BorderColorAlphaclamptosint8, 24, 31) |
+      __gen_field(values->BorderColorBlueclamptouint8, 16, 23) |
+      __gen_field(values->BorderColorBlueclamptosint8, 16, 23) |
+      __gen_field(values->BorderColorGreenclamptouint8, 8, 15) |
+      __gen_field(values->BorderColorGreenclamptosint8, 8, 15) |
+      __gen_field(values->BorderRedAlphaclamptouint8, 0, 7) |
+      __gen_field(values->BorderRedAlphaclamptosint8, 0, 7) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      0;
+
+   dw[3] =
+      0;
+
+}
+
+/* SAMPLER_BORDER_COLOR_STATE.  BorderColorRedDX100GL is the float red
+ * channel; BorderColorAlpha/Blue/Green/BorderColorRedDX9 are the DX9-style
+ * byte-packed channels that share dword 0 with it.  Green0/Blue0/Alpha0
+ * are the float green/blue/alpha channels, and the nested BorderColor*
+ * structs hold the integer-format variants packed later in the state. */
+struct GEN75_SAMPLER_BORDER_COLOR_STATE {
+   float                                        BorderColorRedDX100GL;
+   uint32_t                                     BorderColorAlpha;
+   uint32_t                                     BorderColorBlue;
+   uint32_t                                     BorderColorGreen;
+   uint32_t                                     BorderColorRedDX9;
+   float                                        BorderColorGreen0;
+   float                                        BorderColorBlue0;
+   float                                        BorderColorAlpha0;
+   struct GEN75_BORDER_COLOR_UINT32_SINT32      BorderColor;
+   struct GEN75_BORDER_COLOR_UINT16_SINT16      BorderColor0;
+   struct GEN75_BORDER_COLOR_UINT8_SINT8        BorderColor1;
+};
+
+/* Pack GEN75_SAMPLER_BORDER_COLOR_STATE (20 dwords, per _length above).
+ * Dword 0 ORs the float DX10/GL red channel with the DX9 byte-packed
+ * ARGB; dwords 1..3 hold the float green/blue/alpha; dwords 4..15 are
+ * zero; the 32-bit integer variant lands at dwords 16..19. */
+static inline void
+GEN75_SAMPLER_BORDER_COLOR_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN75_SAMPLER_BORDER_COLOR_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->BorderColorRedDX100GL) |
+      __gen_field(values->BorderColorAlpha, 24, 31) |
+      __gen_field(values->BorderColorBlue, 16, 23) |
+      __gen_field(values->BorderColorGreen, 8, 15) |
+      __gen_field(values->BorderColorRedDX9, 0, 7) |
+      0;
+
+   /* Fix: dwords 1..3 must emit the declared float channels
+    * (BorderColorGreen0/Blue0/Alpha0).  The original passed the integer
+    * DX9 byte fields (BorderColorGreen/Blue/Alpha) to __gen_float, so the
+    * float fields were never written and the integer values were converted
+    * to float bit patterns instead.  Confirm against the HSW PRM. */
+   dw[1] =
+      __gen_float(values->BorderColorGreen0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->BorderColorBlue0) |
+      0;
+
+   dw[3] =
+      __gen_float(values->BorderColorAlpha0) |
+      0;
+
+   for (uint32_t i = 0, j = 4; i < 12; i += 1, j++) {
+      dw[j] =
+         0;
+   }
+
+   /* NOTE(review): the UINT16 (BorderColor0) and UINT8 (BorderColor1)
+    * variants are never packed; only the 32-bit integer variant is emitted
+    * at dw[16..19].  Verify whether the other formats share those dwords. */
+   GEN75_BORDER_COLOR_UINT32_SINT32_pack(data, &dw[16], &values->BorderColor);
+}
+
+#define GEN75_SAMPLER_STATE_length 0x00000004
+
+/* SAMPLER_STATE (4 dwords).  The #defines immediately above each field
+ * are the legal encodings for that field.  The MAPFILTER_* encodings are
+ * re-#defined (with identical values) for both the Mag and Min filter
+ * fields — benign, but redefinitions nonetheless. */
+struct GEN75_SAMPLER_STATE {
+   bool                                         SamplerDisable;
+#define     DX10OGL                                            0
+#define     DX9                                                1
+   uint32_t                                     TextureBorderColorMode;
+#define     OGL                                                1
+   uint32_t                                     LODPreClampEnable;
+   float                                        BaseMipLevel;
+#define     MIPFILTER_NONE                                     0
+#define     MIPFILTER_NEAREST                                  1
+#define     MIPFILTER_LINEAR                                   3
+   uint32_t                                     MipModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MagModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MinModeFilter;
+   float                                        TextureLODBias;
+#define     LEGACY                                             0
+#define     EWAApproximation                                   1
+   uint32_t                                     AnisotropicAlgorithm;
+   float                                        MinLOD;
+   float                                        MaxLOD;
+#define     PREFILTEROPALWAYS                                  0
+#define     PREFILTEROPNEVER                                   1
+#define     PREFILTEROPLESS                                    2
+#define     PREFILTEROPEQUAL                                   3
+#define     PREFILTEROPLEQUAL                                  4
+#define     PREFILTEROPGREATER                                 5
+#define     PREFILTEROPNOTEQUAL                                6
+#define     PREFILTEROPGEQUAL                                  7
+   uint32_t                                     ShadowFunction;
+#define     PROGRAMMED                                         0
+#define     OVERRIDE                                           1
+   uint32_t                                     CubeSurfaceControlMode;
+   uint32_t                                     BorderColorPointer;
+   bool                                         ChromaKeyEnable;
+   uint32_t                                     ChromaKeyIndex;
+#define     KEYFILTER_KILL_ON_ANY_MATCH                        0
+#define     KEYFILTER_REPLACE_BLACK                            1
+   uint32_t                                     ChromaKeyMode;
+#define     RATIO21                                            0
+#define     RATIO41                                            1
+#define     RATIO61                                            2
+#define     RATIO81                                            3
+#define     RATIO101                                           4
+#define     RATIO121                                           5
+#define     RATIO141                                           6
+#define     RATIO161                                           7
+   uint32_t                                     MaximumAnisotropy;
+   bool                                         RAddressMinFilterRoundingEnable;
+   bool                                         RAddressMagFilterRoundingEnable;
+   bool                                         VAddressMinFilterRoundingEnable;
+   bool                                         VAddressMagFilterRoundingEnable;
+   bool                                         UAddressMinFilterRoundingEnable;
+   bool                                         UAddressMagFilterRoundingEnable;
+#define     FULL                                               0
+#define     TRIQUAL_HIGHMAG_CLAMP_MIPFILTER                    1
+#define     MED                                                2
+#define     LOW                                                3
+   uint32_t                                     TrilinearFilterQuality;
+   bool                                         NonnormalizedCoordinateEnable;
+   uint32_t                                     TCXAddressControlMode;
+   uint32_t                                     TCYAddressControlMode;
+   uint32_t                                     TCZAddressControlMode;
+};
+
+/* Pack GEN75_SAMPLER_STATE into 4 dwords.
+ * - BaseMipLevel is scaled by 2 into bits 22..26 (presumably unsigned
+ *   fixed point with 1 fractional bit — confirm against the HSW PRM).
+ * - TextureLODBias goes through __gen_fixed(..., true, 8) into bits 1..13
+ *   (signed fixed point, 8 fractional bits, per the macro arguments).
+ * - Min/MaxLOD are scaled by 256 into 12-bit fields (4.8 fixed point).
+ * - BorderColorPointer is emitted with __gen_offset into bits 5..31 of
+ *   dword 2, i.e. a 32-byte-aligned offset.
+ */
+static inline void
+GEN75_SAMPLER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN75_SAMPLER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SamplerDisable, 31, 31) |
+      __gen_field(values->TextureBorderColorMode, 29, 29) |
+      __gen_field(values->LODPreClampEnable, 28, 28) |
+      __gen_field(values->BaseMipLevel * (1 << 1), 22, 26) |
+      __gen_field(values->MipModeFilter, 20, 21) |
+      __gen_field(values->MagModeFilter, 17, 19) |
+      __gen_field(values->MinModeFilter, 14, 16) |
+      __gen_fixed(values->TextureLODBias, 1, 13, true, 8) |
+      __gen_field(values->AnisotropicAlgorithm, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MinLOD * (1 << 8), 20, 31) |
+      __gen_field(values->MaxLOD * (1 << 8), 8, 19) |
+      __gen_field(values->ShadowFunction, 1, 3) |
+      __gen_field(values->CubeSurfaceControlMode, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->BorderColorPointer, 5, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ChromaKeyEnable, 25, 25) |
+      __gen_field(values->ChromaKeyIndex, 23, 24) |
+      __gen_field(values->ChromaKeyMode, 22, 22) |
+      __gen_field(values->MaximumAnisotropy, 19, 21) |
+      /* Address-rounding enables occupy bits 13..18, one bit each, in the
+       * ascending order written below (R min/mag, V min/mag, U min/mag). */
+      __gen_field(values->RAddressMinFilterRoundingEnable, 13, 13) |
+      __gen_field(values->RAddressMagFilterRoundingEnable, 14, 14) |
+      __gen_field(values->VAddressMinFilterRoundingEnable, 15, 15) |
+      __gen_field(values->VAddressMagFilterRoundingEnable, 16, 16) |
+      __gen_field(values->UAddressMinFilterRoundingEnable, 17, 17) |
+      __gen_field(values->UAddressMagFilterRoundingEnable, 18, 18) |
+      __gen_field(values->TrilinearFilterQuality, 11, 12) |
+      __gen_field(values->NonnormalizedCoordinateEnable, 10, 10) |
+      __gen_field(values->TCXAddressControlMode, 6, 8) |
+      __gen_field(values->TCYAddressControlMode, 3, 5) |
+      __gen_field(values->TCZAddressControlMode, 0, 2) |
+      0;
+
+}
+
+/* Hardware enum value tables (encodings shared by the GEN75 state
+ * definitions in this file). */
+/* Enum 3D_Prim_Topo_Type */
+#define     _3DPRIM_POINTLIST                                  1
+#define     _3DPRIM_LINELIST                                   2
+#define     _3DPRIM_LINESTRIP                                  3
+#define     _3DPRIM_TRILIST                                    4
+#define     _3DPRIM_TRISTRIP                                   5
+#define     _3DPRIM_TRIFAN                                     6
+#define     _3DPRIM_QUADLIST                                   7
+#define     _3DPRIM_QUADSTRIP                                  8
+#define     _3DPRIM_LINELIST_ADJ                               9
+#define     _3DPRIM_LINESTRIP_ADJ                             10
+#define     _3DPRIM_TRILIST_ADJ                               11
+#define     _3DPRIM_TRISTRIP_ADJ                              12
+#define     _3DPRIM_TRISTRIP_REVERSE                          13
+#define     _3DPRIM_POLYGON                                   14
+#define     _3DPRIM_RECTLIST                                  15
+#define     _3DPRIM_LINELOOP                                  16
+#define     _3DPRIM_POINTLIST_BF                              17
+#define     _3DPRIM_LINESTRIP_CONT                            18
+#define     _3DPRIM_LINESTRIP_BF                              19
+#define     _3DPRIM_LINESTRIP_CONT_BF                         20
+#define     _3DPRIM_TRIFAN_NOSTIPPLE                          22
+#define     _3DPRIM_PATCHLIST_1                               32
+#define     _3DPRIM_PATCHLIST_2                               33
+#define     _3DPRIM_PATCHLIST_3                               34
+#define     _3DPRIM_PATCHLIST_4                               35
+#define     _3DPRIM_PATCHLIST_5                               36
+#define     _3DPRIM_PATCHLIST_6                               37
+#define     _3DPRIM_PATCHLIST_7                               38
+#define     _3DPRIM_PATCHLIST_8                               39
+#define     _3DPRIM_PATCHLIST_9                               40
+#define     _3DPRIM_PATCHLIST_10                              41
+#define     _3DPRIM_PATCHLIST_11                              42
+#define     _3DPRIM_PATCHLIST_12                              43
+#define     _3DPRIM_PATCHLIST_13                              44
+#define     _3DPRIM_PATCHLIST_14                              45
+#define     _3DPRIM_PATCHLIST_15                              46
+#define     _3DPRIM_PATCHLIST_16                              47
+#define     _3DPRIM_PATCHLIST_17                              48
+#define     _3DPRIM_PATCHLIST_18                              49
+#define     _3DPRIM_PATCHLIST_19                              50
+#define     _3DPRIM_PATCHLIST_20                              51
+#define     _3DPRIM_PATCHLIST_21                              52
+#define     _3DPRIM_PATCHLIST_22                              53
+#define     _3DPRIM_PATCHLIST_23                              54
+#define     _3DPRIM_PATCHLIST_24                              55
+#define     _3DPRIM_PATCHLIST_25                              56
+#define     _3DPRIM_PATCHLIST_26                              57
+#define     _3DPRIM_PATCHLIST_27                              58
+#define     _3DPRIM_PATCHLIST_28                              59
+#define     _3DPRIM_PATCHLIST_29                              60
+#define     _3DPRIM_PATCHLIST_30                              61
+#define     _3DPRIM_PATCHLIST_31                              62
+#define     _3DPRIM_PATCHLIST_32                              63
+
+/* Enum 3D_Vertex_Component_Control */
+#define     VFCOMP_NOSTORE                                     0
+#define     VFCOMP_STORE_SRC                                   1
+#define     VFCOMP_STORE_0                                     2
+#define     VFCOMP_STORE_1_FP                                  3
+#define     VFCOMP_STORE_1_INT                                 4
+#define     VFCOMP_STORE_VID                                   5
+#define     VFCOMP_STORE_IID                                   6
+#define     VFCOMP_STORE_PID                                   7
+
+/* Enum 3D_Compare_Function */
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+
+/* Enum SURFACE_FORMAT */
+#define     R32G32B32A32_FLOAT                                 0
+#define     R32G32B32A32_SINT                                  1
+#define     R32G32B32A32_UINT                                  2
+#define     R32G32B32A32_UNORM                                 3
+#define     R32G32B32A32_SNORM                                 4
+#define     R64G64_FLOAT                                       5
+#define     R32G32B32X32_FLOAT                                 6
+#define     R32G32B32A32_SSCALED                               7
+#define     R32G32B32A32_USCALED                               8
+#define     R32G32B32A32_SFIXED                               32
+#define     R64G64_PASSTHRU                                   33
+#define     R32G32B32_FLOAT                                   64
+#define     R32G32B32_SINT                                    65
+#define     R32G32B32_UINT                                    66
+#define     R32G32B32_UNORM                                   67
+#define     R32G32B32_SNORM                                   68
+#define     R32G32B32_SSCALED                                 69
+#define     R32G32B32_USCALED                                 70
+#define     R32G32B32_SFIXED                                  80
+#define     R16G16B16A16_UNORM                               128
+#define     R16G16B16A16_SNORM                               129
+#define     R16G16B16A16_SINT                                130
+#define     R16G16B16A16_UINT                                131
+#define     R16G16B16A16_FLOAT                               132
+#define     R32G32_FLOAT                                     133
+#define     R32G32_SINT                                      134
+#define     R32G32_UINT                                      135
+#define     R32_FLOAT_X8X24_TYPELESS                         136
+#define     X32_TYPELESS_G8X24_UINT                          137
+#define     L32A32_FLOAT                                     138
+#define     R32G32_UNORM                                     139
+#define     R32G32_SNORM                                     140
+#define     R64_FLOAT                                        141
+#define     R16G16B16X16_UNORM                               142
+#define     R16G16B16X16_FLOAT                               143
+#define     A32X32_FLOAT                                     144
+#define     L32X32_FLOAT                                     145
+#define     I32X32_FLOAT                                     146
+#define     R16G16B16A16_SSCALED                             147
+#define     R16G16B16A16_USCALED                             148
+#define     R32G32_SSCALED                                   149
+#define     R32G32_USCALED                                   150
+#define     R32G32_SFIXED                                    160
+#define     R64_PASSTHRU                                     161
+#define     B8G8R8A8_UNORM                                   192
+#define     B8G8R8A8_UNORM_SRGB                              193
+#define     R10G10B10A2_UNORM                                194
+#define     R10G10B10A2_UNORM_SRGB                           195
+#define     R10G10B10A2_UINT                                 196
+#define     R10G10B10_SNORM_A2_UNORM                         197
+#define     R8G8B8A8_UNORM                                   199
+#define     R8G8B8A8_UNORM_SRGB                              200
+#define     R8G8B8A8_SNORM                                   201
+#define     R8G8B8A8_SINT                                    202
+#define     R8G8B8A8_UINT                                    203
+#define     R16G16_UNORM                                     204
+#define     R16G16_SNORM                                     205
+#define     R16G16_SINT                                      206
+#define     R16G16_UINT                                      207
+#define     R16G16_FLOAT                                     208
+#define     B10G10R10A2_UNORM                                209
+#define     B10G10R10A2_UNORM_SRGB                           210
+#define     R11G11B10_FLOAT                                  211
+#define     R32_SINT                                         214
+#define     R32_UINT                                         215
+#define     R32_FLOAT                                        216
+#define     R24_UNORM_X8_TYPELESS                            217
+#define     X24_TYPELESS_G8_UINT                             218
+#define     L32_UNORM                                        221
+#define     A32_UNORM                                        222
+#define     L16A16_UNORM                                     223
+#define     I24X8_UNORM                                      224
+#define     L24X8_UNORM                                      225
+#define     A24X8_UNORM                                      226
+#define     I32_FLOAT                                        227
+#define     L32_FLOAT                                        228
+#define     A32_FLOAT                                        229
+#define     X8B8_UNORM_G8R8_SNORM                            230
+#define     A8X8_UNORM_G8R8_SNORM                            231
+#define     B8X8_UNORM_G8R8_SNORM                            232
+#define     B8G8R8X8_UNORM                                   233
+#define     B8G8R8X8_UNORM_SRGB                              234
+#define     R8G8B8X8_UNORM                                   235
+#define     R8G8B8X8_UNORM_SRGB                              236
+#define     R9G9B9E5_SHAREDEXP                               237
+#define     B10G10R10X2_UNORM                                238
+#define     L16A16_FLOAT                                     240
+#define     R32_UNORM                                        241
+#define     R32_SNORM                                        242
+#define     R10G10B10X2_USCALED                              243
+#define     R8G8B8A8_SSCALED                                 244
+#define     R8G8B8A8_USCALED                                 245
+#define     R16G16_SSCALED                                   246
+#define     R16G16_USCALED                                   247
+#define     R32_SSCALED                                      248
+#define     R32_USCALED                                      249
+#define     B5G6R5_UNORM                                     256
+#define     B5G6R5_UNORM_SRGB                                257
+#define     B5G5R5A1_UNORM                                   258
+#define     B5G5R5A1_UNORM_SRGB                              259
+#define     B4G4R4A4_UNORM                                   260
+#define     B4G4R4A4_UNORM_SRGB                              261
+#define     R8G8_UNORM                                       262
+#define     R8G8_SNORM                                       263
+#define     R8G8_SINT                                        264
+#define     R8G8_UINT                                        265
+#define     R16_UNORM                                        266
+#define     R16_SNORM                                        267
+#define     R16_SINT                                         268
+#define     R16_UINT                                         269
+#define     R16_FLOAT                                        270
+#define     A8P8_UNORM_PALETTE0                              271
+#define     A8P8_UNORM_PALETTE1                              272
+#define     I16_UNORM                                        273
+#define     L16_UNORM                                        274
+#define     A16_UNORM                                        275
+#define     L8A8_UNORM                                       276
+#define     I16_FLOAT                                        277
+#define     L16_FLOAT                                        278
+#define     A16_FLOAT                                        279
+#define     L8A8_UNORM_SRGB                                  280
+#define     R5G5_SNORM_B6_UNORM                              281
+#define     B5G5R5X1_UNORM                                   282
+#define     B5G5R5X1_UNORM_SRGB                              283
+#define     R8G8_SSCALED                                     284
+#define     R8G8_USCALED                                     285
+#define     R16_SSCALED                                      286
+#define     R16_USCALED                                      287
+#define     P8A8_UNORM_PALETTE0                              290
+#define     P8A8_UNORM_PALETTE1                              291
+#define     A1B5G5R5_UNORM                                   292
+#define     A4B4G4R4_UNORM                                   293
+#define     L8A8_UINT                                        294
+#define     L8A8_SINT                                        295
+#define     R8_UNORM                                         320
+#define     R8_SNORM                                         321
+#define     R8_SINT                                          322
+#define     R8_UINT                                          323
+#define     A8_UNORM                                         324
+#define     I8_UNORM                                         325
+#define     L8_UNORM                                         326
+#define     P4A4_UNORM_PALETTE0                              327
+#define     A4P4_UNORM_PALETTE0                              328
+#define     R8_SSCALED                                       329
+#define     R8_USCALED                                       330
+#define     P8_UNORM_PALETTE0                                331
+#define     L8_UNORM_SRGB                                    332
+#define     P8_UNORM_PALETTE1                                333
+#define     P4A4_UNORM_PALETTE1                              334
+#define     A4P4_UNORM_PALETTE1                              335
+#define     Y8_UNORM                                         336
+#define     L8_UINT                                          338
+#define     L8_SINT                                          339
+#define     I8_UINT                                          340
+#define     I8_SINT                                          341
+#define     DXT1_RGB_SRGB                                    384
+#define     R1_UNORM                                         385
+#define     YCRCB_NORMAL                                     386
+#define     YCRCB_SWAPUVY                                    387
+#define     P2_UNORM_PALETTE0                                388
+#define     P2_UNORM_PALETTE1                                389
+#define     BC1_UNORM                                        390
+#define     BC2_UNORM                                        391
+#define     BC3_UNORM                                        392
+#define     BC4_UNORM                                        393
+#define     BC5_UNORM                                        394
+#define     BC1_UNORM_SRGB                                   395
+#define     BC2_UNORM_SRGB                                   396
+#define     BC3_UNORM_SRGB                                   397
+#define     MONO8                                            398
+#define     YCRCB_SWAPUV                                     399
+#define     YCRCB_SWAPY                                      400
+#define     DXT1_RGB                                         401
+#define     FXT1                                             402
+#define     R8G8B8_UNORM                                     403
+#define     R8G8B8_SNORM                                     404
+#define     R8G8B8_SSCALED                                   405
+#define     R8G8B8_USCALED                                   406
+#define     R64G64B64A64_FLOAT                               407
+#define     R64G64B64_FLOAT                                  408
+#define     BC4_SNORM                                        409
+#define     BC5_SNORM                                        410
+#define     R16G16B16_FLOAT                                  411
+#define     R16G16B16_UNORM                                  412
+#define     R16G16B16_SNORM                                  413
+#define     R16G16B16_SSCALED                                414
+#define     R16G16B16_USCALED                                415
+#define     BC6H_SF16                                        417
+#define     BC7_UNORM                                        418
+#define     BC7_UNORM_SRGB                                   419
+#define     BC6H_UF16                                        420
+#define     PLANAR_420_8                                     421
+#define     R8G8B8_UNORM_SRGB                                424
+#define     ETC1_RGB8                                        425
+#define     ETC2_RGB8                                        426
+#define     EAC_R11                                          427
+#define     EAC_RG11                                         428
+#define     EAC_SIGNED_R11                                   429
+#define     EAC_SIGNED_RG11                                  430
+#define     ETC2_SRGB8                                       431
+#define     R16G16B16_UINT                                   432
+#define     R16G16B16_SINT                                   433
+#define     R32_SFIXED                                       434
+#define     R10G10B10A2_SNORM                                435
+#define     R10G10B10A2_USCALED                              436
+#define     R10G10B10A2_SSCALED                              437
+#define     R10G10B10A2_SINT                                 438
+#define     B10G10R10A2_SNORM                                439
+#define     B10G10R10A2_USCALED                              440
+#define     B10G10R10A2_SSCALED                              441
+#define     B10G10R10A2_UINT                                 442
+#define     B10G10R10A2_SINT                                 443
+#define     R64G64B64A64_PASSTHRU                            444
+#define     R64G64B64_PASSTHRU                               445
+#define     ETC2_RGB8_PTA                                    448
+#define     ETC2_SRGB8_PTA                                   449
+#define     ETC2_EAC_RGBA8                                   450
+#define     ETC2_EAC_SRGB8_A8                                451
+#define     R8G8B8_UINT                                      456
+#define     R8G8B8_SINT                                      457
+#define     RAW                                              511
+
+/* Enum Texture Coordinate Mode */
+#define     TCM_WRAP                                           0
+#define     TCM_MIRROR                                         1
+#define     TCM_CLAMP                                          2
+#define     TCM_CUBE                                           3
+#define     TCM_CLAMP_BORDER                                   4
+#define     TCM_MIRROR_ONCE                                    5
+
diff --git a/src/vulkan/gen7_cmd_buffer.c b/src/vulkan/gen7_cmd_buffer.c
new file mode 100644 (file)
index 0000000..b83bfda
--- /dev/null
@@ -0,0 +1,902 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen7_pack.h"
+#include "gen75_pack.h"
+
+/* Emit 3DSTATE_CONSTANT_* packets for every graphics stage whose push
+ * constants are marked dirty, and return the mask of stages that were
+ * actually flushed (the same mask is cleared from push_constants_dirty).
+ */
+static uint32_t
+cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* Per-stage _3DCommandSubOpcode values used to retarget the
+    * GEN7_3DSTATE_CONSTANT_VS template at the matching per-stage packet.
+    * Compute has no 3DSTATE constant packet, hence 0. */
+   static const uint32_t push_constant_opcodes[] = {
+      [MESA_SHADER_VERTEX]                      = 21,
+      [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
+      [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
+      [MESA_SHADER_GEOMETRY]                    = 22,
+      [MESA_SHADER_FRAGMENT]                    = 23,
+      [MESA_SHADER_COMPUTE]                     = 0,
+   };
+
+   VkShaderStageFlags flushed = 0;
+
+   anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
+      struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
+
+      /* offset == 0 is treated as "no push constant data for this stage";
+       * nothing to emit. */
+      if (state.offset == 0)
+         continue;
+
+      anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_CONSTANT_VS,
+                     ._3DCommandSubOpcode = push_constant_opcodes[stage],
+                     .ConstantBody = {
+                        .PointerToConstantBuffer0 = { .offset = state.offset },
+                        /* Read length is expressed in 256-bit (32-byte) units. */
+                        .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+                     });
+
+      flushed |= mesa_to_vk_shader_stage(stage);
+   }
+
+   cmd_buffer->state.push_constants_dirty &= ~flushed;
+
+   return flushed;
+}
+
+/* Emit sampler-state and binding-table pointer packets for each stage in
+ * @stages.  Sampler pointers are emitted only when the stage actually has
+ * sampler state allocated; binding table pointers are always emitted.
+ */
+GENX_FUNC(GEN7, GEN7) void
+genX(cmd_buffer_emit_descriptor_pointers)(struct anv_cmd_buffer *cmd_buffer,
+                                          uint32_t stages)
+{
+   /* _3DCommandSubOpcode values selecting the per-stage variant of
+    * 3DSTATE_SAMPLER_STATE_POINTERS_*.  Compute is handled elsewhere (0). */
+   static const uint32_t sampler_state_opcodes[] = {
+      [MESA_SHADER_VERTEX]                      = 43,
+      [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
+      [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
+      [MESA_SHADER_GEOMETRY]                    = 46,
+      [MESA_SHADER_FRAGMENT]                    = 47,
+      [MESA_SHADER_COMPUTE]                     = 0,
+   };
+
+   /* Same idea for 3DSTATE_BINDING_TABLE_POINTERS_*. */
+   static const uint32_t binding_table_opcodes[] = {
+      [MESA_SHADER_VERTEX]                      = 38,
+      [MESA_SHADER_TESS_CTRL]                   = 39,
+      [MESA_SHADER_TESS_EVAL]                   = 40,
+      [MESA_SHADER_GEOMETRY]                    = 41,
+      [MESA_SHADER_FRAGMENT]                    = 42,
+      [MESA_SHADER_COMPUTE]                     = 0,
+   };
+
+   anv_foreach_stage(s, stages) {
+      if (cmd_buffer->state.samplers[s].alloc_size > 0) {
+         anv_batch_emit(&cmd_buffer->batch,
+                        GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS,
+                        ._3DCommandSubOpcode  = sampler_state_opcodes[s],
+                        .PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset);
+      }
+
+      /* Always emit binding table pointers if we're asked to, since on SKL
+       * this is what flushes push constants. */
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS,
+                     ._3DCommandSubOpcode  = binding_table_opcodes[s],
+                     .PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset);
+   }
+}
+
+/* Build sampler and binding tables for every stage whose descriptors are
+ * dirty (restricted to the pipeline's active stages).  On out-of-memory,
+ * grab a fresh binding table block, re-emit STATE_BASE_ADDRESS, and redo
+ * ALL active stages.  Returns the mask of stages that were (re)emitted.
+ */
+GENX_FUNC(GEN7, GEN7) uint32_t
+genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer)
+{
+   VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
+                              cmd_buffer->state.pipeline->active_stages;
+
+   VkResult result = VK_SUCCESS;
+   anv_foreach_stage(s, dirty) {
+      result = anv_cmd_buffer_emit_samplers(cmd_buffer, s,
+                                            &cmd_buffer->state.samplers[s]);
+      if (result != VK_SUCCESS)
+         break;
+      result = anv_cmd_buffer_emit_binding_table(cmd_buffer, s,
+                                                 &cmd_buffer->state.binding_tables[s]);
+      if (result != VK_SUCCESS)
+         break;
+   }
+
+   if (result != VK_SUCCESS) {
+      /* The only failure we expect here is running out of binding table
+       * space; anything else is a programming error. */
+      assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+      result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+      /* NOTE(review): under NDEBUG this result is neither checked nor
+       * propagated — allocation failure would go unnoticed. */
+      assert(result == VK_SUCCESS);
+
+      /* Re-emit state base addresses so we get the new surface state base
+       * address before we start emitting binding tables etc.
+       */
+      anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+
+      /* Re-emit all active binding tables */
+      dirty |= cmd_buffer->state.pipeline->active_stages;
+      anv_foreach_stage(s, dirty) {
+         result = anv_cmd_buffer_emit_samplers(cmd_buffer, s,
+                                               &cmd_buffer->state.samplers[s]);
+         if (result != VK_SUCCESS)
+            return result;
+         result = anv_cmd_buffer_emit_binding_table(cmd_buffer, s,
+                                                    &cmd_buffer->state.binding_tables[s]);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   }
+
+   cmd_buffer->state.descriptors_dirty &= ~dirty;
+
+   return dirty;
+}
+
+/* Clamp x to [min, max].  Done in 64-bit so callers can feed it sums that
+ * would overflow 32 bits (see emit_scissor_state below). */
+static inline int64_t
+clamp_int64(int64_t x, int64_t min, int64_t max)
+{
+   if (x < min)
+      return min;
+   else if (x < max)
+      return x;
+   else
+      return max;
+}
+
+/* Pack @count scissor rectangles into freshly-allocated dynamic state
+ * (32 bytes per GEN7_SCISSOR_RECT) and point the hardware at them with
+ * 3DSTATE_SCISSOR_STATE_POINTERS.  On non-LLC platforms the state is
+ * clflushed so the GPU sees it. */
+static void
+emit_scissor_state(struct anv_cmd_buffer *cmd_buffer,
+                   uint32_t count, const VkRect2D *scissors)
+{
+   struct anv_state scissor_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 32, 32);
+
+   for (uint32_t i = 0; i < count; i++) {
+      const VkRect2D *s = &scissors[i];
+
+      /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
+       * ymax < ymin for empty clips.  In case clip x, y, width height are all
+       * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
+       * what we want. Just special case empty clips and produce a canonical
+       * empty clip. */
+      static const struct GEN7_SCISSOR_RECT empty_scissor = {
+         .ScissorRectangleYMin = 1,
+         .ScissorRectangleXMin = 1,
+         .ScissorRectangleYMax = 0,
+         .ScissorRectangleXMax = 0
+      };
+
+      /* Hardware scissor coordinates are 16-bit. */
+      const int max = 0xffff;
+      struct GEN7_SCISSOR_RECT scissor = {
+         /* Do this math using int64_t so overflow gets clamped correctly. */
+         .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
+         .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
+         .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, max),
+         .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, max)
+      };
+
+      if (s->extent.width <= 0 || s->extent.height <= 0) {
+         GEN7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 32,
+                                &empty_scissor);
+      } else {
+         GEN7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 32, &scissor);
+      }
+   }
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_SCISSOR_STATE_POINTERS,
+                  .ScissorRectPointer = scissor_state.offset);
+
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(scissor_state);
+}
+
+/* Emit the application's dynamic scissors if any were set; otherwise fall
+ * back to a single scissor covering the whole bound framebuffer. */
+GENX_FUNC(GEN7, GEN7) void
+genX(cmd_buffer_emit_scissor)(struct anv_cmd_buffer *cmd_buffer)
+{
+   if (cmd_buffer->state.dynamic.scissor.count > 0) {
+      emit_scissor_state(cmd_buffer, cmd_buffer->state.dynamic.scissor.count,
+                         cmd_buffer->state.dynamic.scissor.scissors);
+   } else {
+      /* Emit a default scissor based on the currently bound framebuffer */
+      emit_scissor_state(cmd_buffer, 1,
+                         &(VkRect2D) {
+                            .offset = { .x = 0, .y = 0, },
+                            .extent = {
+                               .width = cmd_buffer->state.framebuffer->width,
+                               .height = cmd_buffer->state.framebuffer->height,
+                            },
+                         });
+   }
+}
+
+/* Map VkIndexType to the hardware INDEX_* index-buffer format. */
+static const uint32_t vk_to_gen_index_type[] = {
+   [VK_INDEX_TYPE_UINT16]                       = INDEX_WORD,
+   [VK_INDEX_TYPE_UINT32]                       = INDEX_DWORD,
+};
+
+/* Primitive-restart cut index per index type: all-ones of the index width. */
+static const uint32_t restart_index_for_type[] = {
+   [VK_INDEX_TYPE_UINT16]                    = UINT16_MAX,
+   [VK_INDEX_TYPE_UINT32]                    = UINT32_MAX,
+};
+
+/* vkCmdBindIndexBuffer: record the index buffer, type and offset in gen7
+ * per-command-buffer state; the actual 3DSTATE_INDEX_BUFFER packet is
+ * deferred to cmd_buffer_flush_state at draw time. */
+void genX(CmdBindIndexBuffer)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    VkIndexType                                 indexType)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
+   /* Only Haswell has a programmable cut index (3DSTATE_VF); see the
+    * index-buffer emission in cmd_buffer_flush_state. */
+   if (ANV_IS_HASWELL)
+      cmd_buffer->state.restart_index = restart_index_for_type[indexType];
+   cmd_buffer->state.gen7.index_buffer = buffer;
+   cmd_buffer->state.gen7.index_type = vk_to_gen_index_type[indexType];
+   cmd_buffer->state.gen7.index_offset = offset;
+}
+
+/* Build the compute-stage sampler/binding tables, pack an
+ * INTERFACE_DESCRIPTOR_DATA into the dynamic state pool, and load it with
+ * MEDIA_INTERFACE_DESCRIPTOR_LOAD.  Returns VK_SUCCESS or the first
+ * table-emission failure. */
+static VkResult
+flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_device *device = cmd_buffer->device;
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct anv_state surfaces = { 0, }, samplers = { 0, };
+   VkResult result;
+
+   result = anv_cmd_buffer_emit_samplers(cmd_buffer,
+                                         MESA_SHADER_COMPUTE, &samplers);
+   if (result != VK_SUCCESS)
+      return result;
+   result = anv_cmd_buffer_emit_binding_table(cmd_buffer,
+                                              MESA_SHADER_COMPUTE, &surfaces);
+   if (result != VK_SUCCESS)
+      return result;
+
+   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
+
+   /* Interface descriptors must be 64-byte aligned in the dynamic pool. */
+   struct anv_state state =
+      anv_state_pool_emit(&device->dynamic_state_pool,
+                          GEN7_INTERFACE_DESCRIPTOR_DATA, 64,
+                          .KernelStartPointer = pipeline->cs_simd,
+                          .BindingTablePointer = surfaces.offset,
+                          .SamplerStatePointer = samplers.offset,
+                          .BarrierEnable = cs_prog_data->uses_barrier,
+                          .NumberofThreadsinGPGPUThreadGroup =
+                             pipeline->cs_thread_width_max);
+
+   const uint32_t size = GEN7_INTERFACE_DESCRIPTOR_DATA_length * sizeof(uint32_t);
+   anv_batch_emit(&cmd_buffer->batch, GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD,
+                  .InterfaceDescriptorTotalLength = size,
+                  .InterfaceDescriptorDataStartAddress = state.offset);
+
+   return VK_SUCCESS;
+}
+
+/* Flush all dirty compute state before a dispatch: select the GPGPU
+ * pipeline if needed, replay the pipeline batch, and (re)emit the compute
+ * descriptor set when descriptors or the pipeline changed. */
+static void
+cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   VkResult result;
+
+   assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
+
+   if (cmd_buffer->state.current_pipeline != GPGPU) {
+      anv_batch_emit(&cmd_buffer->batch, GEN7_PIPELINE_SELECT,
+                     .PipelineSelection = GPGPU);
+      cmd_buffer->state.current_pipeline = GPGPU;
+   }
+
+   if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
+      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+
+   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
+       (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
+      /* FIXME: figure out descriptors for gen7 */
+      result = flush_compute_descriptor_set(cmd_buffer);
+      /* NOTE(review): result is only consumed by this assert; under NDEBUG
+       * a failure here is silently ignored and `result` is unused. */
+      assert(result == VK_SUCCESS);
+      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+   }
+
+   cmd_buffer->state.compute_dirty = 0;
+}
+
+/* Flush all dirty 3D state before a draw: pipeline select, vertex buffers,
+ * pipeline batch, descriptors, push constants, viewport/scissor, dynamic
+ * SF/CC/depth-stencil state and the index buffer.  Clears the handled
+ * dirty bits on the way out. */
+static void
+cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   uint32_t *p;
+
+   /* Only re-emit vertex buffers that are both dirty and used by the
+    * current pipeline. */
+   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+
+   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+   if (cmd_buffer->state.current_pipeline != _3D) {
+      anv_batch_emit(&cmd_buffer->batch, GEN7_PIPELINE_SELECT,
+                     .PipelineSelection = _3D);
+      cmd_buffer->state.current_pipeline = _3D;
+   }
+
+   if (vb_emit) {
+      /* 3DSTATE_VERTEX_BUFFERS: 1 header dword + 4 dwords per buffer. */
+      const uint32_t num_buffers = __builtin_popcount(vb_emit);
+      const uint32_t num_dwords = 1 + num_buffers * 4;
+
+      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+                          GEN7_3DSTATE_VERTEX_BUFFERS);
+      uint32_t vb, i = 0;
+      for_each_bit(vb, vb_emit) {
+         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
+         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
+
+         struct GEN7_VERTEX_BUFFER_STATE state = {
+            .VertexBufferIndex = vb,
+            .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
+            .VertexBufferMemoryObjectControlState = GEN7_MOCS,
+            .AddressModifyEnable = true,
+            .BufferPitch = pipeline->binding_stride[vb],
+            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+            .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
+            .InstanceDataStepRate = 1
+         };
+
+         GEN7_VERTEX_BUFFER_STATE_pack(&cmd_buffer->batch, &p[1 + i * 4], &state);
+         i++;
+      }
+   }
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      /* If somebody compiled a pipeline after starting a command buffer the
+       * scratch bo may have grown since we started this cmd buffer (and
+       * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
+       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
+      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
+         gen7_cmd_buffer_emit_state_base_address(cmd_buffer);
+
+      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+   }
+
+   if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
+       cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
+      /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
+       *
+       *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
+       *    stall needs to be sent just prior to any 3DSTATE_VS,
+       *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
+       *    3DSTATE_BINDING_TABLE_POINTER_VS,
+       *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
+       *    PIPE_CONTROL needs to be sent before any combination of VS
+       *    associated 3DSTATE."
+       */
+      anv_batch_emit(&cmd_buffer->batch, GEN7_PIPE_CONTROL,
+                     .DepthStallEnable = true,
+                     .PostSyncOperation = WriteImmediateData,
+                     .Address = { &cmd_buffer->device->workaround_bo, 0 });
+   }
+
+   uint32_t dirty = 0;
+   if (cmd_buffer->state.descriptors_dirty) {
+      dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
+      gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
+   }
+
+   if (cmd_buffer->state.push_constants_dirty)
+      cmd_buffer_flush_push_constants(cmd_buffer);
+
+   /* We use the gen8 state here because it only contains the additional
+    * min/max fields and, since they occur at the end of the packet and
+    * don't change the stride, they work on gen7 too.
+    */
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
+      gen8_cmd_buffer_emit_viewport(cmd_buffer);
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
+      gen7_cmd_buffer_emit_scissor(cmd_buffer);
+
+   /* 3DSTATE_SF mixes pipeline state with dynamic line width / depth bias,
+    * so pack the dynamic fields here and merge with the pipeline's copy. */
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH |
+                                  ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) {
+
+      bool enable_bias = cmd_buffer->state.dynamic.depth_bias.bias != 0.0f ||
+         cmd_buffer->state.dynamic.depth_bias.slope != 0.0f;
+
+      uint32_t sf_dw[GEN7_3DSTATE_SF_length];
+      struct GEN7_3DSTATE_SF sf = {
+         GEN7_3DSTATE_SF_header,
+         .LineWidth = cmd_buffer->state.dynamic.line_width,
+         .GlobalDepthOffsetEnableSolid = enable_bias,
+         .GlobalDepthOffsetEnableWireframe = enable_bias,
+         .GlobalDepthOffsetEnablePoint = enable_bias,
+         .GlobalDepthOffsetConstant = cmd_buffer->state.dynamic.depth_bias.bias,
+         .GlobalDepthOffsetScale = cmd_buffer->state.dynamic.depth_bias.slope,
+         .GlobalDepthOffsetClamp = cmd_buffer->state.dynamic.depth_bias.clamp
+      };
+      GEN7_3DSTATE_SF_pack(NULL, sf_dw, &sf);
+
+      anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gen7.sf);
+   }
+
+   /* COLOR_CALC_STATE holds blend constants and stencil reference values. */
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
+      struct anv_state cc_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                            GEN7_COLOR_CALC_STATE_length * 4,
+                                            64);
+      struct GEN7_COLOR_CALC_STATE cc = {
+         .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
+         .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
+         .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
+         .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
+         .StencilReferenceValue =
+            cmd_buffer->state.dynamic.stencil_reference.front,
+         .BackFaceStencilReferenceValue =
+            cmd_buffer->state.dynamic.stencil_reference.back,
+      };
+      GEN7_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(cc_state);
+
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN7_3DSTATE_CC_STATE_POINTERS,
+                     .ColorCalcStatePointer = cc_state.offset);
+   }
+
+   /* DEPTH_STENCIL_STATE: dynamic masks packed here, merged with the
+    * pipeline's static depth/stencil dwords. */
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_RENDER_TARGETS |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
+      uint32_t depth_stencil_dw[GEN7_DEPTH_STENCIL_STATE_length];
+
+      const struct anv_image_view *iview =
+         anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
+
+      struct GEN7_DEPTH_STENCIL_STATE depth_stencil = {
+         .StencilBufferWriteEnable = iview && (iview->aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT),
+
+         .StencilTestMask =
+            cmd_buffer->state.dynamic.stencil_compare_mask.front & 0xff,
+         .StencilWriteMask =
+            cmd_buffer->state.dynamic.stencil_write_mask.front & 0xff,
+
+         .BackfaceStencilTestMask =
+            cmd_buffer->state.dynamic.stencil_compare_mask.back & 0xff,
+         .BackfaceStencilWriteMask =
+            cmd_buffer->state.dynamic.stencil_write_mask.back & 0xff,
+      };
+      GEN7_DEPTH_STENCIL_STATE_pack(NULL, depth_stencil_dw, &depth_stencil);
+
+      struct anv_state ds_state =
+         anv_cmd_buffer_merge_dynamic(cmd_buffer, depth_stencil_dw,
+                                      pipeline->gen7.depth_stencil_state,
+                                      GEN7_DEPTH_STENCIL_STATE_length, 64);
+
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS,
+                     .PointertoDEPTH_STENCIL_STATE = ds_state.offset);
+   }
+
+   if (cmd_buffer->state.gen7.index_buffer &&
+       cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_INDEX_BUFFER)) {
+      struct anv_buffer *buffer = cmd_buffer->state.gen7.index_buffer;
+      uint32_t offset = cmd_buffer->state.gen7.index_offset;
+
+      /* Haswell has a programmable cut index in 3DSTATE_VF; plain gen7
+       * only has the CutIndexEnable bit below. */
+      if (ANV_IS_HASWELL) {
+         anv_batch_emit(&cmd_buffer->batch, GEN75_3DSTATE_VF,
+                        .IndexedDrawCutIndexEnable = pipeline->primitive_restart,
+                        .CutIndex = cmd_buffer->state.restart_index);
+      }
+
+      anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_INDEX_BUFFER,
+                     .CutIndexEnable = pipeline->primitive_restart,
+                     .IndexFormat = cmd_buffer->state.gen7.index_type,
+                     .MemoryObjectControlState = GEN7_MOCS,
+                     .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+                     .BufferEndingAddress = { buffer->bo, buffer->offset + buffer->size });
+   }
+
+   cmd_buffer->state.vb_dirty &= ~vb_emit;
+   cmd_buffer->state.dirty = 0;
+}
+
+/* vkCmdDraw: flush pending state, then emit a sequential (non-indexed)
+ * 3DPRIMITIVE. */
+void genX(CmdDraw)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    vertexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstVertex,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DPRIMITIVE,
+                  .VertexAccessType = SEQUENTIAL,
+                  .PrimitiveTopologyType = pipeline->topology,
+                  .VertexCountPerInstance = vertexCount,
+                  .StartVertexLocation = firstVertex,
+                  .InstanceCount = instanceCount,
+                  .StartInstanceLocation = firstInstance,
+                  .BaseVertexLocation = 0);
+}
+
+/* vkCmdDrawIndexed: flush pending state, then emit an indexed (RANDOM
+ * vertex access) 3DPRIMITIVE using the bound index buffer. */
+void genX(CmdDrawIndexed)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    indexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstIndex,
+    int32_t                                     vertexOffset,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DPRIMITIVE,
+                  .VertexAccessType = RANDOM,
+                  .PrimitiveTopologyType = pipeline->topology,
+                  .VertexCountPerInstance = indexCount,
+                  .StartVertexLocation = firstIndex,
+                  .InstanceCount = instanceCount,
+                  .StartInstanceLocation = firstInstance,
+                  .BaseVertexLocation = vertexOffset);
+}
+
+/* MI_LOAD_REGISTER_MEM: load MMIO register @reg from @bo at @offset.
+ * Used to feed indirect draw/dispatch parameters from a buffer. */
+static void
+gen7_batch_lrm(struct anv_batch *batch,
+              uint32_t reg, struct anv_bo *bo, uint32_t offset)
+{
+   anv_batch_emit(batch, GEN7_MI_LOAD_REGISTER_MEM,
+                  .RegisterAddress = reg,
+                  .MemoryAddress = { bo, offset });
+}
+
+/* MI_LOAD_REGISTER_IMM: load MMIO register @reg with the immediate @imm. */
+static void
+gen7_batch_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
+{
+   anv_batch_emit(batch, GEN7_MI_LOAD_REGISTER_IMM,
+                  .RegisterOffset = reg,
+                  .DataDWord = imm);
+}
+
+/* Auto-Draw / Indirect Registers */
+#define GEN7_3DPRIM_END_OFFSET          0x2420
+#define GEN7_3DPRIM_START_VERTEX        0x2430
+#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
+#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
+#define GEN7_3DPRIM_START_INSTANCE      0x243C
+#define GEN7_3DPRIM_BASE_VERTEX         0x2440
+
+/* vkCmdDrawIndirect: load the VkDrawIndirectCommand fields (vertexCount,
+ * instanceCount, firstVertex, firstInstance) from the buffer into the
+ * 3DPRIM_* registers, then emit an indirect sequential 3DPRIMITIVE.
+ * NOTE(review): drawCount/stride are accepted but not looped over here —
+ * only the first draw record is consumed; confirm against callers. */
+void genX(CmdDrawIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+   /* Non-indexed draws have no base vertex; zero the register explicitly. */
+   gen7_batch_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DPRIMITIVE,
+                  .IndirectParameterEnable = true,
+                  .VertexAccessType = SEQUENTIAL,
+                  .PrimitiveTopologyType = pipeline->topology);
+}
+
+/* vkCmdDrawIndexedIndirect: load the VkDrawIndexedIndirectCommand fields
+ * (indexCount, instanceCount, firstIndex, vertexOffset, firstInstance)
+ * from the buffer into the 3DPRIM_* registers, then emit an indirect
+ * indexed 3DPRIMITIVE.
+ * NOTE(review): as with CmdDrawIndirect, drawCount/stride are not looped
+ * over here — only the first record is consumed; confirm against callers. */
+void genX(CmdDrawIndexedIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
+   gen7_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DPRIMITIVE,
+                  .IndirectParameterEnable = true,
+                  .VertexAccessType = RANDOM,
+                  .PrimitiveTopologyType = pipeline->topology);
+}
+
+/* vkCmdDispatch: flush compute state, then emit a GPGPU_WALKER for an
+ * (x, y, z) grid of thread groups, followed by MEDIA_STATE_FLUSH. */
+void genX(CmdDispatch)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    x,
+    uint32_t                                    y,
+    uint32_t                                    z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_GPGPU_WALKER,
+                  /* simd_size is 8/16/32; the field encodes it / 16. */
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .ThreadGroupIDXDimension = x,
+                  .ThreadGroupIDYDimension = y,
+                  .ThreadGroupIDZDimension = z,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_MEDIA_STATE_FLUSH);
+}
+
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+void genX(CmdDispatchIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
+
+   gen7_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
+   gen7_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
+   gen7_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_GPGPU_WALKER,
+                  .IndirectParameterEnable = true,
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_MEDIA_STATE_FLUSH);
+}
+
+static void
+cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
+{
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   const struct anv_image_view *iview =
+      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
+   const struct anv_image *image = iview ? iview->image : NULL;
+
+   /* XXX: isl needs to grow depth format support */
+   const struct anv_format *anv_format =
+      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
+
+   const bool has_depth = iview && anv_format->depth_format;
+   const bool has_stencil = iview && anv_format->has_stencil;
+
+   /* Emit 3DSTATE_DEPTH_BUFFER */
+   if (has_depth) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         .DepthWriteEnable = true,
+         .StencilWriteEnable = has_stencil,
+         .HierarchicalDepthBufferEnable = false,
+         .SurfaceFormat = anv_format->depth_format,
+         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->depth_surface.offset,
+         },
+         .Height = fb->height - 1,
+         .Width = fb->width - 1,
+         .LOD = 0,
+         .Depth = 1 - 1,
+         .MinimumArrayElement = 0,
+         .DepthBufferObjectControlState = GENX(MOCS),
+         .RenderTargetViewExtent = 1 - 1);
+   } else {
+      /* Even when no depth buffer is present, the hardware requires that
+       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
+       *
+       *    If a null depth buffer is bound, the driver must instead bind depth as:
+       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
+       *       3DSTATE_DEPTH.Width = 1
+       *       3DSTATE_DEPTH.Height = 1
+       *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
+       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
+       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
+       *
+       * The PRM is wrong, though. The width and height must be programmed to
+       * actual framebuffer's width and height, even when neither depth buffer
+       * nor stencil buffer is present.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         .SurfaceFormat = D16_UNORM,
+         .Width = fb->width - 1,
+         .Height = fb->height - 1,
+         .StencilWriteEnable = has_stencil);
+   }
+
+   /* Emit 3DSTATE_STENCIL_BUFFER */
+   if (has_stencil) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
+#     if (ANV_IS_HASWELL)
+         .StencilBufferEnable = true,
+#     endif
+         .StencilBufferObjectControlState = GENX(MOCS),
+
+         /* Stencil buffers have strange pitch. The PRM says:
+          *
+          *    The pitch must be set to 2x the value computed based on width,
+          *    as the stencil buffer is stored with two rows interleaved.
+          */
+         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
+
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->offset + image->stencil_surface.offset,
+         });
+   } else {
+      anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_STENCIL_BUFFER);
+   }
+
+   /* Disable hierarchial depth buffers. */
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_HIER_DEPTH_BUFFER);
+
+   /* Clear the clear params. */
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_CLEAR_PARAMS);
+}
+
+/**
+ * @see anv_cmd_buffer_set_subpass()
+ */
+GENX_FUNC(GEN7, GEN7) void
+genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_subpass *subpass)
+{
+   cmd_buffer->state.subpass = subpass;
+   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
+
+   cmd_buffer_emit_depth_stencil(cmd_buffer);
+}
+
+void genX(CmdBeginRenderPass)(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBegin,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
+   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
+
+   cmd_buffer->state.framebuffer = framebuffer;
+   cmd_buffer->state.pass = pass;
+   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
+
+   const VkRect2D *render_area = &pRenderPassBegin->renderArea;
+
+   anv_batch_emit(&cmd_buffer->batch, GEN7_3DSTATE_DRAWING_RECTANGLE,
+                  .ClippedDrawingRectangleYMin = render_area->offset.y,
+                  .ClippedDrawingRectangleXMin = render_area->offset.x,
+                  .ClippedDrawingRectangleYMax =
+                     render_area->offset.y + render_area->extent.height - 1,
+                  .ClippedDrawingRectangleXMax =
+                     render_area->offset.x + render_area->extent.width - 1,
+                  .DrawingRectangleOriginY = 0,
+                  .DrawingRectangleOriginX = 0);
+
+   gen7_cmd_buffer_set_subpass(cmd_buffer, pass->subpasses);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+void genX(CmdNextSubpass)(
+    VkCommandBuffer                             commandBuffer,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   gen7_cmd_buffer_set_subpass(cmd_buffer, cmd_buffer->state.subpass + 1);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+void genX(CmdEndRenderPass)(
+    VkCommandBuffer                             commandBuffer)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* Emit a flushing pipe control at the end of a pass.  This is kind of a
+    * hack but it ensures that render targets always actually get written.
+    * Eventually, we should do flushing based on image format transitions
+    * or something of that nature.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GEN7_PIPE_CONTROL,
+                  .PostSyncOperation = NoWrite,
+                  .RenderTargetCacheFlushEnable = true,
+                  .InstructionCacheInvalidateEnable = true,
+                  .DepthCacheFlushEnable = true,
+                  .VFCacheInvalidationEnable = true,
+                  .TextureCacheInvalidationEnable = true,
+                  .CommandStreamerStallEnable = true);
+}
+
+void genX(CmdSetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     event,
+    VkPipelineStageFlags                        stageMask)
+{
+   stub();
+}
+
+void genX(CmdResetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     event,
+    VkPipelineStageFlags                        stageMask)
+{
+   stub();
+}
+
+void genX(CmdWaitEvents)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    eventCount,
+    const VkEvent*                              pEvents,
+    VkPipelineStageFlags                        srcStageMask,
+    VkPipelineStageFlags                        destStageMask,
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
+{
+   stub();
+}
diff --git a/src/vulkan/gen7_pack.h b/src/vulkan/gen7_pack.h
new file mode 100644 (file)
index 0000000..a3ba30a
--- /dev/null
@@ -0,0 +1,7003 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+/* Instructions, enums and structures for IVB.
+ *
+ * This file has been generated, do not hand edit.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h>
+
+#ifndef __gen_validate_value
+#define __gen_validate_value(x)
+#endif
+
+#ifndef __gen_field_functions
+#define __gen_field_functions
+
+union __gen_value {
+   float f;
+   uint32_t dw;
+};
+
+static inline uint64_t
+__gen_mbo(uint32_t start, uint32_t end)
+{
+   return (~0ul >> (64 - (end - start + 1))) << start;
+}
+
+static inline uint64_t
+__gen_field(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   if (end - start + 1 < 64)
+      assert(v < 1ul << (end - start + 1));
+#endif
+
+   return v << start;
+}
+
+static inline uint64_t
+__gen_fixed(float v, uint32_t start, uint32_t end,
+            bool is_signed, uint32_t fract_bits)
+{
+   __gen_validate_value(v);
+
+   const float factor = (1 << fract_bits);
+
+   float max, min;
+   if (is_signed) {
+      max = ((1 << (end - start)) - 1) / factor;
+      min = -(1 << (end - start)) / factor;
+   } else {
+      max = ((1 << (end - start + 1)) - 1) / factor;
+      min = 0.0f;
+   }
+
+   if (v > max)
+      v = max;
+   else if (v < min)
+      v = min;
+
+   int32_t int_val = roundf(v * factor);
+
+   if (is_signed)
+      int_val &= (1 << (end - start + 1)) - 1;
+
+   return int_val << start;
+}
+
+static inline uint64_t
+__gen_offset(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   uint64_t mask = (~0ul >> (64 - (end - start + 1))) << start;
+
+   assert((v & ~mask) == 0);
+#endif
+
+   return v;
+}
+
+static inline uint32_t
+__gen_float(float v)
+{
+   __gen_validate_value(v);
+   return ((union __gen_value) { .f = (v) }).dw;
+}
+
+#ifndef __gen_address_type
+#error #define __gen_address_type before including this file
+#endif
+
+#ifndef __gen_user_data
+#error #define __gen_combine_address before including this file
+#endif
+
+#endif
+
+#define GEN7_3DSTATE_URB_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_URB_VS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 48,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_URB_VS_length 0x00000002
+
+struct GEN7_3DSTATE_URB_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     VSURBStartingAddress;
+   uint32_t                                     VSURBEntryAllocationSize;
+   uint32_t                                     VSNumberofURBEntries;
+};
+
+static inline void
+GEN7_3DSTATE_URB_VS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_3DSTATE_URB_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->VSURBStartingAddress, 25, 29) |
+      __gen_field(values->VSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->VSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+#define GEN7_MI_STORE_REGISTER_MEM_length_bias 0x00000002
+#define GEN7_MI_STORE_REGISTER_MEM_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 36,                  \
+   .DwordLength          =  1
+
+#define GEN7_MI_STORE_REGISTER_MEM_length 0x00000003
+
+struct GEN7_MI_STORE_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+static inline void
+GEN7_MI_STORE_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN7_MI_STORE_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+#define GEN7_PIPELINE_SELECT_length_bias 0x00000001
+#define GEN7_PIPELINE_SELECT_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4
+
+#define GEN7_PIPELINE_SELECT_length 0x00000001
+
+struct GEN7_PIPELINE_SELECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+#define     _3D                                                0
+#define     Media                                              1
+#define     GPGPU                                              2
+   uint32_t                                     PipelineSelection;
+};
+
+static inline void
+GEN7_PIPELINE_SELECT_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN7_PIPELINE_SELECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->PipelineSelection, 0, 1) |
+      0;
+
+}
+
+#define GEN7_STATE_BASE_ADDRESS_length_bias 0x00000002
+#define GEN7_STATE_BASE_ADDRESS_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  1,                  \
+   .DwordLength          =  8
+
+#define GEN7_STATE_BASE_ADDRESS_length 0x0000000a
+
+#define GEN7_MEMORY_OBJECT_CONTROL_STATE_length 0x00000001
+
+struct GEN7_MEMORY_OBJECT_CONTROL_STATE {
+   uint32_t                                     GraphicsDataTypeGFDT;
+   uint32_t                                     LLCCacheabilityControlLLCCC;
+   uint32_t                                     L3CacheabilityControlL3CC;
+};
+
+static inline void
+GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN7_MEMORY_OBJECT_CONTROL_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->GraphicsDataTypeGFDT, 2, 2) |
+      __gen_field(values->LLCCacheabilityControlLLCCC, 1, 1) |
+      __gen_field(values->L3CacheabilityControlL3CC, 0, 0) |
+      0;
+
+}
+
+struct GEN7_STATE_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GeneralStateBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      GeneralStateMemoryObjectControlState;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      StatelessDataPortAccessMemoryObjectControlState;
+   uint32_t                                     StatelessDataPortAccessForceWriteThru;
+   bool                                         GeneralStateBaseAddressModifyEnable;
+   __gen_address_type                           SurfaceStateBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      SurfaceStateMemoryObjectControlState;
+   bool                                         SurfaceStateBaseAddressModifyEnable;
+   __gen_address_type                           DynamicStateBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      DynamicStateMemoryObjectControlState;
+   bool                                         DynamicStateBaseAddressModifyEnable;
+   __gen_address_type                           IndirectObjectBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      IndirectObjectMemoryObjectControlState;
+   bool                                         IndirectObjectBaseAddressModifyEnable;
+   __gen_address_type                           InstructionBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      InstructionMemoryObjectControlState;
+   bool                                         InstructionBaseAddressModifyEnable;
+   __gen_address_type                           GeneralStateAccessUpperBound;
+   bool                                         GeneralStateAccessUpperBoundModifyEnable;
+   __gen_address_type                           DynamicStateAccessUpperBound;
+   bool                                         DynamicStateAccessUpperBoundModifyEnable;
+   __gen_address_type                           IndirectObjectAccessUpperBound;
+   bool                                         IndirectObjectAccessUpperBoundModifyEnable;
+   __gen_address_type                           InstructionAccessUpperBound;
+   bool                                         InstructionAccessUpperBoundModifyEnable;
+};
+
+static inline void
+GEN7_STATE_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN7_STATE_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_GeneralStateMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_GeneralStateMemoryObjectControlState, &values->GeneralStateMemoryObjectControlState);
+   uint32_t dw_StatelessDataPortAccessMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StatelessDataPortAccessMemoryObjectControlState, &values->StatelessDataPortAccessMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_GeneralStateMemoryObjectControlState, 8, 11) |
+      __gen_field(dw_StatelessDataPortAccessMemoryObjectControlState, 4, 7) |
+      __gen_field(values->StatelessDataPortAccessForceWriteThru, 3, 3) |
+      __gen_field(values->GeneralStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->GeneralStateBaseAddress, dw1);
+
+   uint32_t dw_SurfaceStateMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceStateMemoryObjectControlState, &values->SurfaceStateMemoryObjectControlState);
+   uint32_t dw2 =
+      __gen_field(dw_SurfaceStateMemoryObjectControlState, 8, 11) |
+      __gen_field(values->SurfaceStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceStateBaseAddress, dw2);
+
+   uint32_t dw_DynamicStateMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DynamicStateMemoryObjectControlState, &values->DynamicStateMemoryObjectControlState);
+   uint32_t dw3 =
+      __gen_field(dw_DynamicStateMemoryObjectControlState, 8, 11) |
+      __gen_field(values->DynamicStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->DynamicStateBaseAddress, dw3);
+
+   uint32_t dw_IndirectObjectMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_IndirectObjectMemoryObjectControlState, &values->IndirectObjectMemoryObjectControlState);
+   uint32_t dw4 =
+      __gen_field(dw_IndirectObjectMemoryObjectControlState, 8, 11) |
+      __gen_field(values->IndirectObjectBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[4] =
+      __gen_combine_address(data, &dw[4], values->IndirectObjectBaseAddress, dw4);
+
+   uint32_t dw_InstructionMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_InstructionMemoryObjectControlState, &values->InstructionMemoryObjectControlState);
+   uint32_t dw5 =
+      __gen_field(dw_InstructionMemoryObjectControlState, 8, 11) |
+      __gen_field(values->InstructionBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   dw[5] =
+      __gen_combine_address(data, &dw[5], values->InstructionBaseAddress, dw5);
+
+   uint32_t dw6 =
+      __gen_field(values->GeneralStateAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[6] =
+      __gen_combine_address(data, &dw[6], values->GeneralStateAccessUpperBound, dw6);
+
+   uint32_t dw7 =
+      __gen_field(values->DynamicStateAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[7] =
+      __gen_combine_address(data, &dw[7], values->DynamicStateAccessUpperBound, dw7);
+
+   uint32_t dw8 =
+      __gen_field(values->IndirectObjectAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[8] =
+      __gen_combine_address(data, &dw[8], values->IndirectObjectAccessUpperBound, dw8);
+
+   uint32_t dw9 =
+      __gen_field(values->InstructionAccessUpperBoundModifyEnable, 0, 0) |
+      0;
+
+   dw[9] =
+      __gen_combine_address(data, &dw[9], values->InstructionAccessUpperBound, dw9);
+
+}
+
+#define GEN7_STATE_PREFETCH_length_bias 0x00000002
+#define GEN7_STATE_PREFETCH_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN7_STATE_PREFETCH_length 0x00000002
+
+struct GEN7_STATE_PREFETCH {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PrefetchPointer;
+   uint32_t                                     PrefetchCount;
+};
+
+static inline void
+GEN7_STATE_PREFETCH_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_STATE_PREFETCH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->PrefetchCount, 0, 2) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PrefetchPointer, dw1);
+
+}
+
+#define GEN7_STATE_SIP_length_bias 0x00000002
+#define GEN7_STATE_SIP_header                   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2,                  \
+   .DwordLength          =  0
+
+#define GEN7_STATE_SIP_length 0x00000002
+
+struct GEN7_STATE_SIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SystemInstructionPointer;
+};
+
+static inline void
+GEN7_STATE_SIP_pack(__gen_user_data *data, void * restrict dst,
+                    const struct GEN7_STATE_SIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SystemInstructionPointer, 4, 31) |
+      0;
+
+}
+
+#define GEN7_SWTESS_BASE_ADDRESS_length_bias 0x00000002
+#define GEN7_SWTESS_BASE_ADDRESS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN7_SWTESS_BASE_ADDRESS_length 0x00000002
+
+struct GEN7_SWTESS_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           SWTessellationBaseAddress;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      SWTessellationMemoryObjectControlState;
+};
+
+static inline void
+GEN7_SWTESS_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_SWTESS_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_SWTessellationMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SWTessellationMemoryObjectControlState, &values->SWTessellationMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_SWTessellationMemoryObjectControlState, 8, 11) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->SWTessellationBaseAddress, dw1);
+
+}
+
+/* 3DPRIMITIVE: the draw command.  _header expands to a designated
+ * initializer carrying the fixed opcode fields, _length is the total
+ * command size in dwords (7), and _length_bias is the bias applied
+ * when encoding DwordLength.
+ */
+#define GEN7_3DPRIMITIVE_length_bias 0x00000002
+#define GEN7_3DPRIMITIVE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  3,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DPRIMITIVE_length 0x00000007
+
+struct GEN7_3DPRIMITIVE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndirectParameterEnable;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   bool                                         EndOffsetEnable;
+/* Values for VertexAccessType (sequential = non-indexed draw,
+ * random = indexed draw). */
+#define     SEQUENTIAL                                         0
+#define     RANDOM                                             1
+   uint32_t                                     VertexAccessType;
+   uint32_t                                     PrimitiveTopologyType;
+   uint32_t                                     VertexCountPerInstance;
+   uint32_t                                     StartVertexLocation;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     StartInstanceLocation;
+   uint32_t                                     BaseVertexLocation;
+};
+
+/* Pack *values into the 7-dword 3DPRIMITIVE command at dst.  No
+ * relocations are emitted, so data is unused; dst must have room for
+ * GEN7_3DPRIMITIVE_length dwords.
+ */
+static inline void
+GEN7_3DPRIMITIVE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN7_3DPRIMITIVE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->EndOffsetEnable, 9, 9) |
+      __gen_field(values->VertexAccessType, 8, 8) |
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->VertexCountPerInstance, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->StartVertexLocation, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->InstanceCount, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->StartInstanceLocation, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->BaseVertexLocation, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_AA_LINE_PARAMETERS: antialiased-line coverage parameters.
+ * Three dwords total; the float slope/bias values are converted to
+ * fixed point by scaling by 2^8 before insertion into their 8-bit
+ * fields (i.e. 8 fractional bits).
+ */
+#define GEN7_3DSTATE_AA_LINE_PARAMETERS_length_bias 0x00000002
+#define GEN7_3DSTATE_AA_LINE_PARAMETERS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_AA_LINE_PARAMETERS_length 0x00000003
+
+struct GEN7_3DSTATE_AA_LINE_PARAMETERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        AACoverageBias;
+   float                                        AACoverageSlope;
+   float                                        AACoverageEndCapBias;
+   float                                        AACoverageEndCapSlope;
+};
+
+/* Pack *values into the 3-dword command at dst; data is unused
+ * (no relocations). */
+static inline void
+GEN7_3DSTATE_AA_LINE_PARAMETERS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN7_3DSTATE_AA_LINE_PARAMETERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AACoverageBias * (1 << 8), 16, 23) |
+      __gen_field(values->AACoverageSlope * (1 << 8), 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AACoverageEndCapBias * (1 << 8), 16, 23) |
+      __gen_field(values->AACoverageEndCapSlope * (1 << 8), 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_{DS,GS,HS,PS,VS}: five 2-dword
+ * commands, identical in layout and differing only in their
+ * _3DCommandSubOpcode (38..42).  Dword 1 carries the per-stage
+ * binding table pointer, stored in bits 5..15 via __gen_offset.
+ */
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 40,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS_length 0x00000002
+
+struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSBindingTable;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSBindingTable, 5, 15) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 41,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS_length 0x00000002
+
+struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSBindingTable;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSBindingTable, 5, 15) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 39,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS_length 0x00000002
+
+struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSBindingTable;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSBindingTable, 5, 15) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS_length_bias 0x00000002
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 42,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS_length 0x00000002
+
+struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSBindingTable;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSBindingTable, 5, 15) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 38,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS_length 0x00000002
+
+struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSBindingTable;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BLEND_STATE_POINTERS / 3DSTATE_CC_STATE_POINTERS: 2-dword
+ * commands whose dword 1 holds a state pointer in bits 6..31 plus
+ * bit 0 set via __gen_mbo (presumably a "must be one" bit — confirm
+ * against the PRM).
+ */
+#define GEN7_3DSTATE_BLEND_STATE_POINTERS_length_bias 0x00000002
+#define GEN7_3DSTATE_BLEND_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 36,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_BLEND_STATE_POINTERS_length 0x00000002
+
+struct GEN7_3DSTATE_BLEND_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BlendStatePointer;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_BLEND_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN7_3DSTATE_BLEND_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->BlendStatePointer, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_CC_STATE_POINTERS_length_bias 0x00000002
+#define GEN7_3DSTATE_CC_STATE_POINTERS_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 14,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_CC_STATE_POINTERS_length 0x00000002
+
+struct GEN7_3DSTATE_CC_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ColorCalcStatePointer;
+};
+
+/* Pack *values into the 2-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_CC_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN7_3DSTATE_CC_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ColorCalcStatePointer, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_CHROMA_KEY: 4-dword command selecting a chroma-key table
+ * entry (bits 30..31 of dword 1) and its low/high key values
+ * (dwords 2 and 3).
+ */
+#define GEN7_3DSTATE_CHROMA_KEY_length_bias 0x00000002
+#define GEN7_3DSTATE_CHROMA_KEY_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_CHROMA_KEY_length 0x00000004
+
+struct GEN7_3DSTATE_CHROMA_KEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ChromaKeyTableIndex;
+   uint32_t                                     ChromaKeyLowValue;
+   uint32_t                                     ChromaKeyHighValue;
+};
+
+/* Pack *values into the 4-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_CHROMA_KEY_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN7_3DSTATE_CHROMA_KEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyTableIndex, 30, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChromaKeyLowValue, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ChromaKeyHighValue, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_CLEAR_PARAMS: 3-dword command carrying the depth clear
+ * value (dword 1) and a valid flag for it (bit 0 of dword 2).
+ */
+#define GEN7_3DSTATE_CLEAR_PARAMS_length_bias 0x00000002
+#define GEN7_3DSTATE_CLEAR_PARAMS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_CLEAR_PARAMS_length 0x00000003
+
+struct GEN7_3DSTATE_CLEAR_PARAMS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DepthClearValue;
+   bool                                         DepthClearValueValid;
+};
+
+/* Pack *values into the 3-dword command at dst; data is unused. */
+static inline void
+GEN7_3DSTATE_CLEAR_PARAMS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_3DSTATE_CLEAR_PARAMS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DepthClearValue, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DepthClearValueValid, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_CLIP: 4-dword command configuring the clip stage (culling,
+ * clip tests, provoking-vertex selection, point-width clamps,
+ * viewport index limits).
+ */
+#define GEN7_3DSTATE_CLIP_length_bias 0x00000002
+#define GEN7_3DSTATE_CLIP_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_CLIP_length 0x00000004
+
+struct GEN7_3DSTATE_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     FrontWinding;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   bool                                         EarlyCullEnable;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+   bool                                         ClipperStatisticsEnable;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+   bool                                         ClipEnable;
+#define     APIMODE_OGL                                        0
+   uint32_t                                     APIMode;
+   bool                                         ViewportXYClipTestEnable;
+   bool                                         ViewportZClipTestEnable;
+   bool                                         GuardbandClipTestEnable;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+#define     CLIPMODE_NORMAL                                    0
+#define     CLIPMODE_REJECT_ALL                                3
+#define     CLIPMODE_ACCEPT_ALL                                4
+   uint32_t                                     ClipMode;
+   bool                                         PerspectiveDivideDisable;
+   bool                                         NonPerspectiveBarycentricEnable;
+/* Vertex0/1/2 are re-#defined with identical values before each
+ * provoking-vertex field below; benign redefinition in C. */
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+   uint32_t                                     LineStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+   float                                        MinimumPointWidth;
+   float                                        MaximumPointWidth;
+   bool                                         ForceZeroRTAIndexEnable;
+   uint32_t                                     MaximumVPIndex;
+};
+
+/* Pack *values into the 4-dword 3DSTATE_CLIP command at dst.  The
+ * float point-width limits are converted to fixed point by scaling
+ * by 2^3 (3 fractional bits).  data is unused (no relocations).
+ */
+static inline void
+GEN7_3DSTATE_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_3DSTATE_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->FrontWinding, 20, 20) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 19, 19) |
+      __gen_field(values->EarlyCullEnable, 18, 18) |
+      __gen_field(values->CullMode, 16, 17) |
+      __gen_field(values->ClipperStatisticsEnable, 10, 10) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClipEnable, 31, 31) |
+      __gen_field(values->APIMode, 30, 30) |
+      __gen_field(values->ViewportXYClipTestEnable, 28, 28) |
+      __gen_field(values->ViewportZClipTestEnable, 27, 27) |
+      __gen_field(values->GuardbandClipTestEnable, 26, 26) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 16, 23) |
+      __gen_field(values->ClipMode, 13, 15) |
+      __gen_field(values->PerspectiveDivideDisable, 9, 9) |
+      __gen_field(values->NonPerspectiveBarycentricEnable, 8, 8) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 4, 5) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 2, 3) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 0, 1) |
+      0;
+
+   dw[3] =
+      __gen_field(values->MinimumPointWidth * (1 << 3), 17, 27) |
+      __gen_field(values->MaximumPointWidth * (1 << 3), 6, 16) |
+      __gen_field(values->ForceZeroRTAIndexEnable, 5, 5) |
+      __gen_field(values->MaximumVPIndex, 0, 3) |
+      0;
+
+}
+
+#define GEN7_3DSTATE_CONSTANT_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_CONSTANT_DS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_CONSTANT_DS_length 0x00000007
+
+#define GEN7_3DSTATE_CONSTANT_BODY_length 0x00000006
+
+/* Shared 6-dword payload of all 3DSTATE_CONSTANT_* commands: read
+ * lengths for four constant buffers plus their (relocated) pointers.
+ * The MOCS (memory object control state) is packed only into the
+ * buffer-0 pointer dword (bits 0..4).
+ */
+struct GEN7_3DSTATE_CONSTANT_BODY {
+   uint32_t                                     ConstantBuffer1ReadLength;
+   uint32_t                                     ConstantBuffer0ReadLength;
+   uint32_t                                     ConstantBuffer3ReadLength;
+   uint32_t                                     ConstantBuffer2ReadLength;
+   __gen_address_type                           PointerToConstantBuffer0;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   __gen_address_type                           PointerToConstantBuffer1;
+   __gen_address_type                           PointerToConstantBuffer2;
+   __gen_address_type                           PointerToConstantBuffer3;
+};
+
+/* Pack *values into 6 dwords at dst.  Each buffer pointer is emitted
+ * through __gen_combine_address so relocations are recorded against
+ * data.
+ */
+static inline void
+GEN7_3DSTATE_CONSTANT_BODY_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN7_3DSTATE_CONSTANT_BODY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ConstantBuffer1ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer0ReadLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBuffer3ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer2ReadLength, 0, 15) |
+      0;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   uint32_t dw2 =
+      __gen_field(dw_ConstantBufferObjectControlState, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->PointerToConstantBuffer0, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->PointerToConstantBuffer1, dw3);
+
+   uint32_t dw4 =
+      0;
+
+   dw[4] =
+      __gen_combine_address(data, &dw[4], values->PointerToConstantBuffer2, dw4);
+
+   uint32_t dw5 =
+      0;
+
+   dw[5] =
+      __gen_combine_address(data, &dw[5], values->PointerToConstantBuffer3, dw5);
+
+}
+
+struct GEN7_3DSTATE_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* 3DSTATE_CONSTANT_DS: header dword followed by the shared
+ * CONSTANT_BODY in dwords 1..6. */
+static inline void
+GEN7_3DSTATE_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN7_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_CONSTANT_{GS,HS,PS,VS}: identical 7-dword commands that
+ * differ only in _3DCommandSubOpcode; dwords 1..6 are the shared
+ * GEN7_3DSTATE_CONSTANT_BODY payload.
+ */
+#define GEN7_3DSTATE_CONSTANT_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_CONSTANT_GS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_CONSTANT_GS_length 0x00000007
+
+struct GEN7_3DSTATE_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN7_3DSTATE_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN7_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN7_3DSTATE_CONSTANT_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_CONSTANT_HS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_CONSTANT_HS_length 0x00000007
+
+struct GEN7_3DSTATE_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN7_3DSTATE_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN7_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN7_3DSTATE_CONSTANT_PS_length_bias 0x00000002
+#define GEN7_3DSTATE_CONSTANT_PS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 23,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_CONSTANT_PS_length 0x00000007
+
+struct GEN7_3DSTATE_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN7_3DSTATE_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN7_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN7_3DSTATE_CONSTANT_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_CONSTANT_VS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_CONSTANT_VS_length 0x00000007
+
+struct GEN7_3DSTATE_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN7_3DSTATE_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN7_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_DEPTH_BUFFER: describes the bound depth surface (type, format,
+ * pitch, base address, dimensions, HiZ enable, MOCS).  Auto-generated from
+ * the gen7 XML; do not hand-edit bit ranges.
+ */
+#define GEN7_3DSTATE_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN7_3DSTATE_DEPTH_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  5,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_DEPTH_BUFFER_length 0x00000007
+
+struct GEN7_3DSTATE_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         DepthWriteEnable;
+   bool                                         StencilWriteEnable;
+   bool                                         HierarchicalDepthBufferEnable;
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     SurfaceFormat;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     LOD;
+#define     SURFTYPE_CUBEmustbezero                            0
+   uint32_t                                     Depth;
+   uint32_t                                     MinimumArrayElement;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      DepthBufferObjectControlState;
+   uint32_t                                     DepthCoordinateOffsetY;
+   uint32_t                                     DepthCoordinateOffsetX;
+   uint32_t                                     RenderTargetViewExtent;
+};
+
+static inline void
+GEN7_3DSTATE_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_3DSTATE_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->DepthWriteEnable, 28, 28) |
+      __gen_field(values->StencilWriteEnable, 27, 27) |
+      __gen_field(values->HierarchicalDepthBufferEnable, 22, 22) |
+      __gen_field(values->SurfaceFormat, 18, 20) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   /* dw2 carries no extra bits; the relocation helper merges the surface
+    * address into dw[2] (and may record a relocation via 'data').
+    */
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[3] =
+      __gen_field(values->Height, 18, 31) |
+      __gen_field(values->Width, 4, 17) |
+      __gen_field(values->LOD, 0, 3) |
+      0;
+
+   /* MOCS is packed separately, then placed into bits 0..3 of dw[4]. */
+   uint32_t dw_DepthBufferObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DepthBufferObjectControlState, &values->DepthBufferObjectControlState);
+   dw[4] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->MinimumArrayElement, 10, 20) |
+      __gen_field(dw_DepthBufferObjectControlState, 0, 3) |
+      0;
+
+   dw[5] =
+      __gen_field(values->DepthCoordinateOffsetY, 16, 31) |
+      __gen_field(values->DepthCoordinateOffsetX, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->RenderTargetViewExtent, 21, 31) |
+      0;
+
+}
+
+/* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS: 2-DWord command carrying a
+ * 64-byte-aligned offset to a DEPTH_STENCIL_STATE structure.
+ * Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_length_bias 0x00000002
+#define GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 37,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_length 0x00000002
+
+struct GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDEPTH_STENCIL_STATE;
+};
+
+static inline void
+GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                               const struct GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* __gen_mbo(0, 0) sets the "must be one" bit 0 required by the layout. */
+   dw[1] =
+      __gen_offset(values->PointertoDEPTH_STENCIL_STATE, 6, 31) |
+      __gen_mbo(0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_DRAWING_RECTANGLE: clipped drawing rectangle min/max corners and
+ * the drawing-rectangle origin, each packed as 16-bit Y|X pairs.
+ * Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_DRAWING_RECTANGLE_length_bias 0x00000002
+#define GEN7_3DSTATE_DRAWING_RECTANGLE_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_DRAWING_RECTANGLE_length 0x00000004
+
+struct GEN7_3DSTATE_DRAWING_RECTANGLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ClippedDrawingRectangleYMin;
+   uint32_t                                     ClippedDrawingRectangleXMin;
+   uint32_t                                     ClippedDrawingRectangleYMax;
+   uint32_t                                     ClippedDrawingRectangleXMax;
+   uint32_t                                     DrawingRectangleOriginY;
+   uint32_t                                     DrawingRectangleOriginX;
+};
+
+static inline void
+GEN7_3DSTATE_DRAWING_RECTANGLE_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN7_3DSTATE_DRAWING_RECTANGLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ClippedDrawingRectangleYMin, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMin, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClippedDrawingRectangleYMax, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMax, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DrawingRectangleOriginY, 16, 31) |
+      __gen_field(values->DrawingRectangleOriginX, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_DS: domain (tessellation evaluation) shader stage state — kernel
+ * pointer, thread/dispatch controls, scratch space and enable bits.
+ * Auto-generated from the gen7 XML; identical re-#defines of enum values
+ * (Dmask/Vmask, sampler counts, ...) across commands are intentional.
+ */
+#define GEN7_3DSTATE_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_DS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 29,                  \
+   .DwordLength          =  4
+
+#define GEN7_3DSTATE_DS_length 0x00000006
+
+struct GEN7_3DSTATE_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleDomainPointDispatch;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     PatchURBEntryReadLength;
+   uint32_t                                     PatchURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+   bool                                         ComputeWCoordinateEnable;
+   bool                                         DSCacheDisable;
+   bool                                         DSFunctionEnable;
+};
+
+static inline void
+GEN7_3DSTATE_DS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleDomainPointDispatch, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
+      __gen_field(values->PatchURBEntryReadLength, 11, 17) |
+      __gen_field(values->PatchURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 25, 31) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->ComputeWCoordinateEnable, 2, 2) |
+      __gen_field(values->DSCacheDisable, 1, 1) |
+      __gen_field(values->DSFunctionEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_GS: geometry shader stage state — kernel pointer, sampler and
+ * binding-table counts, scratch, output topology/vertex size, URB read
+ * configuration, dispatch mode and enable bits, plus a semaphore handle.
+ * Auto-generated from the gen7 XML; do not hand-edit bit ranges.
+ */
+#define GEN7_3DSTATE_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_GS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_GS_length 0x00000007
+
+struct GEN7_3DSTATE_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     SingleProgramFlowSPF;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     OutputVertexSize;
+   uint32_t                                     OutputTopology;
+   uint32_t                                     VertexURBEntryReadLength;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     DispatchGRFStartRegisterforURBData;
+   uint32_t                                     MaximumNumberofThreads;
+#define     GSCTL_CUT                                          0
+#define     GSCTL_SID                                          1
+   uint32_t                                     ControlDataFormat;
+   uint32_t                                     ControlDataHeaderSize;
+   uint32_t                                     InstanceControl;
+   uint32_t                                     DefaultStreamID;
+#define     SINGLE                                             0
+#define     DUAL_INSTANCE                                      1
+#define     DUAL_OBJECT                                        2
+   uint32_t                                     DispatchMode;
+   uint32_t                                     GSStatisticsEnable;
+   uint32_t                                     GSInvocationsIncrementValue;
+   bool                                         IncludePrimitiveID;
+   uint32_t                                     Hint;
+   bool                                         ReorderEnable;
+   bool                                         DiscardAdjacency;
+   bool                                         GSEnable;
+   uint32_t                                     SemaphoreHandle;
+};
+
+static inline void
+GEN7_3DSTATE_GS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleProgramFlowSPF, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->OutputVertexSize, 23, 28) |
+      __gen_field(values->OutputTopology, 17, 22) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->IncludeVertexHandles, 10, 10) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      __gen_field(values->DispatchGRFStartRegisterforURBData, 0, 3) |
+      0;
+
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 25, 31) |
+      __gen_field(values->ControlDataFormat, 24, 24) |
+      __gen_field(values->ControlDataHeaderSize, 20, 23) |
+      __gen_field(values->InstanceControl, 15, 19) |
+      __gen_field(values->DefaultStreamID, 13, 14) |
+      __gen_field(values->DispatchMode, 11, 12) |
+      __gen_field(values->GSStatisticsEnable, 10, 10) |
+      __gen_field(values->GSInvocationsIncrementValue, 5, 9) |
+      __gen_field(values->IncludePrimitiveID, 4, 4) |
+      __gen_field(values->Hint, 3, 3) |
+      __gen_field(values->ReorderEnable, 2, 2) |
+      __gen_field(values->DiscardAdjacency, 1, 1) |
+      __gen_field(values->GSEnable, 0, 0) |
+      0;
+
+   dw[6] =
+      __gen_offset(values->SemaphoreHandle, 0, 11) |
+      0;
+
+}
+
+/* 3DSTATE_HIER_DEPTH_BUFFER: hierarchical-Z buffer MOCS, pitch, and base
+ * address (emitted via the relocation-aware address combiner).
+ * Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_HIER_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN7_3DSTATE_HIER_DEPTH_BUFFER_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_HIER_DEPTH_BUFFER_length 0x00000003
+
+struct GEN7_3DSTATE_HIER_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      HierarchicalDepthBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+};
+
+static inline void
+GEN7_3DSTATE_HIER_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN7_3DSTATE_HIER_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed separately, then placed into bits 25..28 of dw[1]. */
+   uint32_t dw_HierarchicalDepthBufferObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_HierarchicalDepthBufferObjectControlState, &values->HierarchicalDepthBufferObjectControlState);
+   dw[1] =
+      __gen_field(dw_HierarchicalDepthBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+}
+
+/* 3DSTATE_HS: hull (tessellation control) shader stage state — sampler and
+ * binding-table counts, thread limit, enables, kernel pointer, scratch,
+ * URB read configuration, and a semaphore handle.
+ * Auto-generated from the gen7 XML; do not hand-edit bit ranges.
+ */
+#define GEN7_3DSTATE_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_HS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 27,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_HS_length 0x00000007
+
+struct GEN7_3DSTATE_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         Enable;
+   bool                                         StatisticsEnable;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     SemaphoreHandle;
+};
+
+static inline void
+GEN7_3DSTATE_HS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      __gen_field(values->MaximumNumberofThreads, 0, 6) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Enable, 31, 31) |
+      __gen_field(values->StatisticsEnable, 29, 29) |
+      __gen_field(values->InstanceCount, 0, 3) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[4] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[5] =
+      __gen_field(values->SingleProgramFlow, 27, 27) |
+      __gen_field(values->VectorMaskEnable, 26, 26) |
+      __gen_field(values->IncludeVertexHandles, 24, 24) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 19, 23) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[6] =
+      __gen_offset(values->SemaphoreHandle, 0, 11) |
+      0;
+
+}
+
+/* 3DSTATE_INDEX_BUFFER: index buffer binding — MOCS, cut-index enable and
+ * index format live in the header DWord; start and end addresses are
+ * emitted via the relocation-aware address combiner.
+ * Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_INDEX_BUFFER_length_bias 0x00000002
+#define GEN7_3DSTATE_INDEX_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_INDEX_BUFFER_length 0x00000003
+
+struct GEN7_3DSTATE_INDEX_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   bool                                         CutIndexEnable;
+#define     INDEX_BYTE                                         0
+#define     INDEX_WORD                                         1
+#define     INDEX_DWORD                                        2
+   uint32_t                                     IndexFormat;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BufferStartingAddress;
+   __gen_address_type                           BufferEndingAddress;
+};
+
+static inline void
+GEN7_3DSTATE_INDEX_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_3DSTATE_INDEX_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Unusually for this file, MOCS is packed into the header DWord
+    * (bits 12..15) rather than a payload DWord.
+    */
+   uint32_t dw_MemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_MemoryObjectControlState, 12, 15) |
+      __gen_field(values->CutIndexEnable, 10, 10) |
+      __gen_field(values->IndexFormat, 8, 9) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->BufferEndingAddress, dw2);
+
+}
+
+/* 3DSTATE_LINE_STIPPLE: line stipple pattern, repeat counters, and the
+ * inverse repeat count (a float converted to fixed point on pack).
+ * Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_LINE_STIPPLE_length_bias 0x00000002
+#define GEN7_3DSTATE_LINE_STIPPLE_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  8,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_LINE_STIPPLE_length 0x00000003
+
+struct GEN7_3DSTATE_LINE_STIPPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ModifyEnableCurrentRepeatCounterCurrentStippleIndex;
+   uint32_t                                     CurrentRepeatCounter;
+   uint32_t                                     CurrentStippleIndex;
+   uint32_t                                     LineStipplePattern;
+   float                                        LineStippleInverseRepeatCount;
+   uint32_t                                     LineStippleRepeatCount;
+};
+
+static inline void
+GEN7_3DSTATE_LINE_STIPPLE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_3DSTATE_LINE_STIPPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ModifyEnableCurrentRepeatCounterCurrentStippleIndex, 31, 31) |
+      __gen_field(values->CurrentRepeatCounter, 21, 29) |
+      __gen_field(values->CurrentStippleIndex, 16, 19) |
+      __gen_field(values->LineStipplePattern, 0, 15) |
+      0;
+
+   /* The float inverse repeat count is scaled by 2^16 to fixed point
+    * before being packed into bits 15..31.
+    */
+   dw[2] =
+      __gen_field(values->LineStippleInverseRepeatCount * (1 << 16), 15, 31) |
+      __gen_field(values->LineStippleRepeatCount, 0, 8) |
+      0;
+
+}
+
+/* 3DSTATE_MONOFILTER_SIZE: monochrome filter width (bits 3..5) and height
+ * (bits 0..2) in a single payload DWord.  Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_MONOFILTER_SIZE_length_bias 0x00000002
+#define GEN7_3DSTATE_MONOFILTER_SIZE_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_MONOFILTER_SIZE_length 0x00000002
+
+struct GEN7_3DSTATE_MONOFILTER_SIZE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     MonochromeFilterWidth;
+   uint32_t                                     MonochromeFilterHeight;
+};
+
+static inline void
+GEN7_3DSTATE_MONOFILTER_SIZE_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN7_3DSTATE_MONOFILTER_SIZE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MonochromeFilterWidth, 3, 5) |
+      __gen_field(values->MonochromeFilterHeight, 0, 2) |
+      0;
+
+}
+
+/* 3DSTATE_MULTISAMPLE: pixel location, sample count, and per-sample X/Y
+ * offsets (floats, converted to fixed point by the pack function, which
+ * continues past this hunk).  Auto-generated from the gen7 XML.
+ */
+#define GEN7_3DSTATE_MULTISAMPLE_length_bias 0x00000002
+#define GEN7_3DSTATE_MULTISAMPLE_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 13,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_MULTISAMPLE_length 0x00000004
+
+struct GEN7_3DSTATE_MULTISAMPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     PIXLOC_CENTER                                      0
+#define     PIXLOC_UL_CORNER                                   1
+   uint32_t                                     PixelLocation;
+#define     NUMSAMPLES_1                                       0
+#define     NUMSAMPLES_4                                       2
+#define     NUMSAMPLES_8                                       3
+   uint32_t                                     NumberofMultisamples;
+   float                                        Sample3XOffset;
+   float                                        Sample3YOffset;
+   float                                        Sample2XOffset;
+   float                                        Sample2YOffset;
+   float                                        Sample1XOffset;
+   float                                        Sample1YOffset;
+   float                                        Sample0XOffset;
+   float                                        Sample0YOffset;
+   float                                        Sample7XOffset;
+   float                                        Sample7YOffset;
+   float                                        Sample6XOffset;
+   float                                        Sample6YOffset;
+   float                                        Sample5XOffset;
+   float                                        Sample5YOffset;
+   float                                        Sample4XOffset;
+   float                                        Sample4YOffset;
+};
+
+static inline void
+GEN7_3DSTATE_MULTISAMPLE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_MULTISAMPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PixelLocation, 4, 4) |
+      __gen_field(values->NumberofMultisamples, 1, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Sample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Sample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->Sample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->Sample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->Sample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->Sample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->Sample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->Sample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->Sample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_POLY_STIPPLE_OFFSET: 2-dword 3D state command (type 3,
+ * subtype 3, opcode 1, subopcode 6) setting the polygon stipple X/Y
+ * pattern offsets.
+ */
+#define GEN7_3DSTATE_POLY_STIPPLE_OFFSET_length_bias 0x00000002
+#define GEN7_3DSTATE_POLY_STIPPLE_OFFSET_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_POLY_STIPPLE_OFFSET_length 0x00000002
+
+struct GEN7_3DSTATE_POLY_STIPPLE_OFFSET {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PolygonStippleXOffset;
+   uint32_t                                     PolygonStippleYOffset;
+};
+
+/* Pack *values into dst as 2 dwords.  dw1 holds the X offset in bits
+ * 12:8 and the Y offset in bits 4:0 (5-bit fields each).  `data` is
+ * unused (common pack-function signature).
+ */
+static inline void
+GEN7_3DSTATE_POLY_STIPPLE_OFFSET_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN7_3DSTATE_POLY_STIPPLE_OFFSET * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PolygonStippleXOffset, 8, 12) |
+      __gen_field(values->PolygonStippleYOffset, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_POLY_STIPPLE_PATTERN: 33-dword 3D state command (type 3,
+ * subtype 3, opcode 1, subopcode 7) loading the full 32x32 polygon
+ * stipple pattern, one 32-bit row per dword.
+ */
+#define GEN7_3DSTATE_POLY_STIPPLE_PATTERN_length_bias 0x00000002
+#define GEN7_3DSTATE_POLY_STIPPLE_PATTERN_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          = 31
+
+#define GEN7_3DSTATE_POLY_STIPPLE_PATTERN_length 0x00000021
+
+struct GEN7_3DSTATE_POLY_STIPPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PatternRow[32];
+};
+
+/* Pack *values into dst: the command header in dw0 followed by the 32
+ * pattern rows in dw1..dw32.  `data` is unused (common signature).
+ */
+static inline void
+GEN7_3DSTATE_POLY_STIPPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN7_3DSTATE_POLY_STIPPLE_PATTERN * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Each pattern row occupies a full dword. */
+   for (uint32_t i = 0, j = 1; i < 32; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->PatternRow[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+/* 3DSTATE_PS: 8-dword 3D state command (type 3, subtype 3, opcode 0,
+ * subopcode 32) programming the pixel shader stage: up to three kernel
+ * start pointers, thread-control flags, scratch space, and dispatch
+ * GRF start registers.
+ */
+#define GEN7_3DSTATE_PS_length_bias 0x00000002
+#define GEN7_3DSTATE_PS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 32,                  \
+   .DwordLength          =  6
+
+#define GEN7_3DSTATE_PS_length 0x00000008
+
+struct GEN7_3DSTATE_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer0;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlowSPF;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+   uint32_t                                     SamplerCount;
+#define     FTZ                                                0
+#define     RET                                                1
+   uint32_t                                     DenormalMode;
+   uint32_t                                     BindingTableEntryCount;
+/* NOTE(review): "IEEE745" looks like a generator typo for IEEE 754;
+ * the identifier is kept exactly as generated. */
+#define     IEEE745                                            0
+#define     Alt                                                1
+   uint32_t                                     FloatingPointMode;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         PushConstantEnable;
+   bool                                         AttributeEnable;
+   bool                                         oMaskPresenttoRenderTarget;
+   bool                                         RenderTargetFastClearEnable;
+   bool                                         DualSourceBlendEnable;
+   bool                                         RenderTargetResolveEnable;
+#define     POSOFFSET_NONE                                     0
+#define     POSOFFSET_CENTROID                                 2
+#define     POSOFFSET_SAMPLE                                   3
+   uint32_t                                     PositionXYOffsetSelect;
+   bool                                         _32PixelDispatchEnable;
+   bool                                         _16PixelDispatchEnable;
+   bool                                         _8PixelDispatchEnable;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData0;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData1;
+   uint32_t                                     DispatchGRFStartRegisterforConstantSetupData2;
+   uint32_t                                     KernelStartPointer1;
+   uint32_t                                     KernelStartPointer2;
+};
+
+/* Pack *values into dst as 8 dwords.  dw0: command header.
+ * dw1/dw6/dw7: the three kernel start pointers as 64-byte-aligned
+ * offsets (bits 31:6, via __gen_offset).  dw2: thread control and
+ * exception-enable flags.  dw3: scratch base (bits 31:10) and
+ * per-thread scratch size.  dw4: max threads plus the dispatch-enable
+ * and render-target flags.  dw5: the three dispatch GRF start
+ * registers.  `data` is unused (common pack-function signature).
+ */
+static inline void
+GEN7_3DSTATE_PS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer0, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleProgramFlowSPF, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->DenormalMode, 26, 26) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->RoundingMode, 14, 15) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->MaximumNumberofThreads, 24, 31) |
+      __gen_field(values->PushConstantEnable, 11, 11) |
+      __gen_field(values->AttributeEnable, 10, 10) |
+      __gen_field(values->oMaskPresenttoRenderTarget, 9, 9) |
+      __gen_field(values->RenderTargetFastClearEnable, 8, 8) |
+      __gen_field(values->DualSourceBlendEnable, 7, 7) |
+      __gen_field(values->RenderTargetResolveEnable, 6, 6) |
+      __gen_field(values->PositionXYOffsetSelect, 3, 4) |
+      __gen_field(values->_32PixelDispatchEnable, 2, 2) |
+      __gen_field(values->_16PixelDispatchEnable, 1, 1) |
+      __gen_field(values->_8PixelDispatchEnable, 0, 0) |
+      0;
+
+   dw[5] =
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData0, 16, 22) |
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData1, 8, 14) |
+      __gen_field(values->DispatchGRFStartRegisterforConstantSetupData2, 0, 6) |
+      0;
+
+   dw[6] =
+      __gen_offset(values->KernelStartPointer1, 6, 31) |
+      0;
+
+   dw[7] =
+      __gen_offset(values->KernelStartPointer2, 6, 31) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_DS: 2-dword command (type 3, subtype 3,
+ * opcode 1, subopcode 20) allocating the DS push-constant buffer:
+ * offset in dw1 bits 19:16, size in dw1 bits 4:0.
+ */
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length 0x00000002
+
+struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* _0KB is #defined twice below with the same value: a benign identical
+ * redefinition emitted by the generator for each field. */
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferOffset;
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused (common
+ * pack-function signature).
+ */
+static inline void
+GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 19) |
+      __gen_field(values->ConstantBufferSize, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_GS: 2-dword command (type 3, subtype 3,
+ * opcode 1, subopcode 21) allocating the GS push-constant buffer;
+ * same field layout as the DS/HS/PS/VS variants.
+ */
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length 0x00000002
+
+struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Benign identical redefinition of _0KB from the generator. */
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferOffset;
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as 2 dwords: command header, then offset
+ * (bits 19:16) and size (bits 4:0).  `data` is unused.
+ */
+static inline void
+GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 19) |
+      __gen_field(values->ConstantBufferSize, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_HS: 2-dword command (type 3, subtype 3,
+ * opcode 1, subopcode 19) allocating the HS push-constant buffer;
+ * same field layout as the DS/GS/PS/VS variants.
+ */
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length 0x00000002
+
+struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Benign identical redefinition of _0KB from the generator. */
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferOffset;
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as 2 dwords: command header, then offset
+ * (bits 19:16) and size (bits 4:0).  `data` is unused.
+ */
+static inline void
+GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 19) |
+      __gen_field(values->ConstantBufferSize, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_PS: 2-dword command (type 3, subtype 3,
+ * opcode 1, subopcode 22) allocating the PS push-constant buffer;
+ * same field layout as the DS/GS/HS/VS variants.
+ */
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length_bias 0x00000002
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length 0x00000002
+
+struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Benign identical redefinition of _0KB from the generator. */
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferOffset;
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as 2 dwords: command header, then offset
+ * (bits 19:16) and size (bits 4:0).  `data` is unused.
+ */
+static inline void
+GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 19) |
+      __gen_field(values->ConstantBufferSize, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_VS: 2-dword command (type 3, subtype 3,
+ * opcode 1, subopcode 18) allocating the VS push-constant buffer;
+ * same field layout as the DS/GS/HS/PS variants.
+ */
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length 0x00000002
+
+struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Benign identical redefinition of _0KB from the generator. */
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferOffset;
+#define     _0KB                                               0
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Pack *values into dst as 2 dwords: command header, then offset
+ * (bits 19:16) and size (bits 4:0).  `data` is unused.
+ */
+static inline void
+GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 19) |
+      __gen_field(values->ConstantBufferSize, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_PALETTE_LOAD0: variable-length command (type 3,
+ * subtype 3, opcode 1, subopcode 2); the header dword is followed by a
+ * caller-supplied number of palette entries (see GEN7_PALETTE_ENTRY).
+ * The header macro deliberately omits DwordLength — the caller sets it
+ * from the entry count.
+ */
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2
+
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0_length 0x00000000
+
+#define GEN7_PALETTE_ENTRY_length 0x00000001
+
+/* One palette color: 8 bits per channel, packed ARGB in a single dword. */
+struct GEN7_PALETTE_ENTRY {
+   uint32_t                                     Alpha;
+   uint32_t                                     Red;
+   uint32_t                                     Green;
+   uint32_t                                     Blue;
+};
+
+/* Pack one palette entry into a single dword: A[31:24] R[23:16]
+ * G[15:8] B[7:0].  `data` is unused (common signature).
+ */
+static inline void
+GEN7_PALETTE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN7_PALETTE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->Alpha, 24, 31) |
+      __gen_field(values->Red, 16, 23) |
+      __gen_field(values->Green, 8, 15) |
+      __gen_field(values->Blue, 0, 7) |
+      0;
+
+}
+
+struct GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword; the caller emits the trailing palette
+ * entries separately.  `data` is unused (common signature).
+ */
+static inline void
+GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN7_3DSTATE_SAMPLER_PALETTE_LOAD0 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_SAMPLER_PALETTE_LOAD1: variable-length command (type 3,
+ * subtype 3, opcode 1, subopcode 12) for the second sampler palette;
+ * header-only pack like PALETTE_LOAD0, entries are emitted by the
+ * caller.  DwordLength is caller-supplied.
+ */
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 12
+
+#define GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1_length 0x00000000
+
+struct GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword.  `data` is unused (common signature). */
+static inline void
+GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN7_3DSTATE_SAMPLER_PALETTE_LOAD1 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_DS: 2-dword command (type 3,
+ * subtype 3, opcode 0, subopcode 45) carrying the DS sampler-state
+ * pointer as a 32-byte-aligned offset (dw1 bits 31:5).
+ */
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 45,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSSamplerState;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused. */
+static inline void
+GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_GS: 2-dword command (type 3,
+ * subtype 3, opcode 0, subopcode 46) carrying the GS sampler-state
+ * pointer (dw1 bits 31:5).
+ */
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 46,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSSamplerState;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused. */
+static inline void
+GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_HS: 2-dword command (type 3,
+ * subtype 3, opcode 0, subopcode 44) carrying the HS sampler-state
+ * pointer (dw1 bits 31:5).
+ */
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 44,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSSamplerState;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused. */
+static inline void
+GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_PS: 2-dword command (type 3,
+ * subtype 3, opcode 0, subopcode 47) carrying the PS sampler-state
+ * pointer (dw1 bits 31:5).
+ */
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 47,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSSamplerState;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused. */
+static inline void
+GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_VS: 2-dword command (type 3,
+ * subtype 3, opcode 0, subopcode 43) carrying the VS sampler-state
+ * pointer (dw1 bits 31:5).
+ */
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 43,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSSamplerState;
+};
+
+/* Pack *values into dst as 2 dwords.  `data` is unused. */
+static inline void
+GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSSamplerState, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SAMPLE_MASK (3/3/0/24; 2 dwords): programs the global MSAA sample
+ * mask.  Only dw1 bits 7:0 are used, i.e. up to 8 samples on this gen.
+ */
+#define GEN7_3DSTATE_SAMPLE_MASK_length_bias 0x00000002
+#define GEN7_3DSTATE_SAMPLE_MASK_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SAMPLE_MASK_length 0x00000002
+
+struct GEN7_3DSTATE_SAMPLE_MASK {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SampleMask;
+};
+
+/* Packs *values into the 2-dword command at dst. */
+static inline void
+GEN7_3DSTATE_SAMPLE_MASK_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_3DSTATE_SAMPLE_MASK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SampleMask, 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_SBE (3/3/0/31; 14 dwords): setup-backend attribute routing between
+ * the geometry pipeline's vertex URB entries and the pixel shader inputs.
+ * NOTE(review): this generated struct only models the "Attribute 0" swizzle
+ * pair (the "2n"/"2n+1" fields of dw2); dw3-dw9 (attributes 1-15) have no
+ * struct fields and the pack helper below never writes them -- presumably the
+ * driver fills those dwords itself.  TODO confirm against the generator.
+ */
+#define GEN7_3DSTATE_SBE_length_bias 0x00000002
+#define GEN7_3DSTATE_SBE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 31,                  \
+   .DwordLength          = 12
+
+#define GEN7_3DSTATE_SBE_length 0x0000000e
+
+struct GEN7_3DSTATE_SBE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     SWIZ_0_15                                          0
+#define     SWIZ_16_31                                         1
+   uint32_t                                     AttributeSwizzleControlMode;
+   uint32_t                                     NumberofSFOutputAttributes;
+   bool                                         AttributeSwizzleEnable;
+#define     UPPERLEFT                                          0
+#define     LOWERLEFT                                          1
+   uint32_t                                     PointSpriteTextureCoordinateOrigin;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   bool                                         Attribute2n1ComponentOverrideW;
+   bool                                         Attribute2n1ComponentOverrideZ;
+   bool                                         Attribute2n1ComponentOverrideY;
+   bool                                         Attribute2n1ComponentOverrideX;
+/* The CONST_*/PRIM_ID and INPUTATTR* macros below are re-#defined with
+ * identical replacement lists for each field they annotate; identical
+ * redefinition is permitted by C (benign redefinition, C11 6.10.3).
+ */
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     Attribute2n1ConstantSource;
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     Attribute2n1SwizzleSelect;
+   uint32_t                                     Attribute2n1SourceAttribute;
+   bool                                         Attribute2nComponentOverrideW;
+   bool                                         Attribute2nComponentOverrideZ;
+   bool                                         Attribute2nComponentOverrideY;
+   bool                                         Attribute2nComponentOverrideX;
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     Attribute2nConstantSource;
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     Attribute2nSwizzleSelect;
+   uint32_t                                     Attribute2nSourceAttribute;
+   uint32_t                                     PointSpriteTextureCoordinateEnable;
+   uint32_t                                     ConstantInterpolationEnable310;
+   uint32_t                                     Attribute7WrapShortestEnables;
+   uint32_t                                     Attribute6WrapShortestEnables;
+   uint32_t                                     Attribute5WrapShortestEnables;
+   uint32_t                                     Attribute4WrapShortestEnables;
+   uint32_t                                     Attribute3WrapShortestEnables;
+   uint32_t                                     Attribute2WrapShortestEnables;
+   uint32_t                                     Attribute1WrapShortestEnables;
+   uint32_t                                     Attribute0WrapShortestEnables;
+   uint32_t                                     Attribute15WrapShortestEnables;
+   uint32_t                                     Attribute14WrapShortestEnables;
+   uint32_t                                     Attribute13WrapShortestEnables;
+   uint32_t                                     Attribute12WrapShortestEnables;
+   uint32_t                                     Attribute11WrapShortestEnables;
+   uint32_t                                     Attribute10WrapShortestEnables;
+   uint32_t                                     Attribute9WrapShortestEnables;
+   uint32_t                                     Attribute8WrapShortestEnables;
+};
+
+/* Packs *values into the 14-dword command at dst.  Writes dw0-dw2 and
+ * dw10-dw13 only; dw3-dw9 are left untouched (see struct note above), so the
+ * caller is responsible for those dwords' contents.
+ */
+static inline void
+GEN7_3DSTATE_SBE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN7_3DSTATE_SBE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AttributeSwizzleControlMode, 28, 28) |
+      __gen_field(values->NumberofSFOutputAttributes, 22, 27) |
+      __gen_field(values->AttributeSwizzleEnable, 21, 21) |
+      __gen_field(values->PointSpriteTextureCoordinateOrigin, 20, 20) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 15) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Attribute2n1ComponentOverrideW, 31, 31) |
+      __gen_field(values->Attribute2n1ComponentOverrideZ, 30, 30) |
+      __gen_field(values->Attribute2n1ComponentOverrideY, 29, 29) |
+      __gen_field(values->Attribute2n1ComponentOverrideX, 28, 28) |
+      __gen_field(values->Attribute2n1ConstantSource, 25, 26) |
+      __gen_field(values->Attribute2n1SwizzleSelect, 22, 23) |
+      __gen_field(values->Attribute2n1SourceAttribute, 16, 20) |
+      __gen_field(values->Attribute2nComponentOverrideW, 15, 15) |
+      __gen_field(values->Attribute2nComponentOverrideZ, 14, 14) |
+      __gen_field(values->Attribute2nComponentOverrideY, 13, 13) |
+      __gen_field(values->Attribute2nComponentOverrideX, 12, 12) |
+      __gen_field(values->Attribute2nConstantSource, 9, 10) |
+      __gen_field(values->Attribute2nSwizzleSelect, 6, 7) |
+      __gen_field(values->Attribute2nSourceAttribute, 0, 4) |
+      0;
+
+   dw[10] =
+      __gen_field(values->PointSpriteTextureCoordinateEnable, 0, 31) |
+      0;
+
+   dw[11] =
+      __gen_field(values->ConstantInterpolationEnable310, 0, 31) |
+      0;
+
+   dw[12] =
+      __gen_field(values->Attribute7WrapShortestEnables, 28, 31) |
+      __gen_field(values->Attribute6WrapShortestEnables, 24, 27) |
+      __gen_field(values->Attribute5WrapShortestEnables, 20, 23) |
+      __gen_field(values->Attribute4WrapShortestEnables, 16, 19) |
+      __gen_field(values->Attribute3WrapShortestEnables, 12, 15) |
+      __gen_field(values->Attribute2WrapShortestEnables, 8, 11) |
+      __gen_field(values->Attribute1WrapShortestEnables, 4, 7) |
+      __gen_field(values->Attribute0WrapShortestEnables, 0, 3) |
+      0;
+
+   dw[13] =
+      __gen_field(values->Attribute15WrapShortestEnables, 28, 31) |
+      __gen_field(values->Attribute14WrapShortestEnables, 24, 27) |
+      __gen_field(values->Attribute13WrapShortestEnables, 20, 23) |
+      __gen_field(values->Attribute12WrapShortestEnables, 16, 19) |
+      __gen_field(values->Attribute11WrapShortestEnables, 12, 15) |
+      __gen_field(values->Attribute10WrapShortestEnables, 8, 11) |
+      __gen_field(values->Attribute9WrapShortestEnables, 4, 7) |
+      __gen_field(values->Attribute8WrapShortestEnables, 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_SCISSOR_STATE_POINTERS (3/3/0/15; 2 dwords): dw1 bits 31:5 hold
+ * the 32-byte-aligned offset of the SCISSOR_RECT array.
+ */
+#define GEN7_3DSTATE_SCISSOR_STATE_POINTERS_length_bias 0x00000002
+#define GEN7_3DSTATE_SCISSOR_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 15,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_SCISSOR_STATE_POINTERS_length 0x00000002
+
+struct GEN7_3DSTATE_SCISSOR_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScissorRectPointer;
+};
+
+/* Packs *values into the 2-dword command at dst. */
+static inline void
+GEN7_3DSTATE_SCISSOR_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN7_3DSTATE_SCISSOR_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ScissorRectPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SF (3/3/0/19; 7 dwords): strips-and-fans / rasterizer state --
+ * fill modes, winding, cull mode, line/point widths, provoking vertex
+ * selection, and global depth-offset (polygon offset) constants.
+ */
+#define GEN7_3DSTATE_SF_length_bias 0x00000002
+#define GEN7_3DSTATE_SF_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  5
+
+#define GEN7_3DSTATE_SF_length 0x00000007
+
+struct GEN7_3DSTATE_SF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     D32_FLOAT_S8X24_UINT                               0
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_S8_UINT                                  2
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     DepthBufferSurfaceFormat;
+   bool                                         LegacyGlobalDepthBiasEnable;
+   bool                                         StatisticsEnable;
+   bool                                         GlobalDepthOffsetEnableSolid;
+   bool                                         GlobalDepthOffsetEnableWireframe;
+   bool                                         GlobalDepthOffsetEnablePoint;
+/* RASTER_* values are re-#defined identically for the back-face field below;
+ * identical macro redefinition is legal C. */
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     FrontFaceFillMode;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     BackFaceFillMode;
+   bool                                         ViewTransformEnable;
+   uint32_t                                     FrontWinding;
+   bool                                         AntiAliasingEnable;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+   float                                        LineWidth;
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   bool                                         ScissorRectangleEnable;
+   uint32_t                                     MultisampleRasterizationMode;
+   bool                                         LastPixelEnable;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+#define     Vertex0                                            0
+#define     Vertex1                                            1
+#define     Vertex2                                            2
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+#define     AALINEDISTANCE_TRUE                                1
+   uint32_t                                     AALineDistanceMode;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   uint32_t                                     UsePointWidthState;
+   float                                        PointWidth;
+   float                                        GlobalDepthOffsetConstant;
+   float                                        GlobalDepthOffsetScale;
+   float                                        GlobalDepthOffsetClamp;
+};
+
+/* Packs *values into the 7-dword command at dst.  LineWidth is converted to
+ * fixed point with 7 fractional bits and PointWidth with 3 fractional bits;
+ * the depth-offset dwords are raw IEEE float bit patterns via __gen_float.
+ */
+static inline void
+GEN7_3DSTATE_SF_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_SF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DepthBufferSurfaceFormat, 12, 14) |
+      __gen_field(values->LegacyGlobalDepthBiasEnable, 11, 11) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->GlobalDepthOffsetEnableSolid, 9, 9) |
+      __gen_field(values->GlobalDepthOffsetEnableWireframe, 8, 8) |
+      __gen_field(values->GlobalDepthOffsetEnablePoint, 7, 7) |
+      __gen_field(values->FrontFaceFillMode, 5, 6) |
+      __gen_field(values->BackFaceFillMode, 3, 4) |
+      __gen_field(values->ViewTransformEnable, 1, 1) |
+      __gen_field(values->FrontWinding, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AntiAliasingEnable, 31, 31) |
+      __gen_field(values->CullMode, 29, 30) |
+      __gen_field(values->LineWidth * (1 << 7), 18, 27) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 16, 17) |
+      __gen_field(values->ScissorRectangleEnable, 11, 11) |
+      __gen_field(values->MultisampleRasterizationMode, 8, 9) |
+      0;
+
+   dw[3] =
+      __gen_field(values->LastPixelEnable, 31, 31) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 29, 30) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 27, 28) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 25, 26) |
+      __gen_field(values->AALineDistanceMode, 14, 14) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 12, 12) |
+      __gen_field(values->UsePointWidthState, 11, 11) |
+      __gen_field(values->PointWidth * (1 << 3), 0, 10) |
+      0;
+
+   dw[4] =
+      __gen_float(values->GlobalDepthOffsetConstant) |
+      0;
+
+   dw[5] =
+      __gen_float(values->GlobalDepthOffsetScale) |
+      0;
+
+   dw[6] =
+      __gen_float(values->GlobalDepthOffsetClamp) |
+      0;
+
+}
+
+/* 3DSTATE_SO_BUFFER (3/3/1/24; 4 dwords): binds one of the four stream-output
+ * buffers.  The base and end addresses in dw2/dw3 go through
+ * __gen_combine_address so the batch-buffer writer can emit relocations.
+ */
+#define GEN7_3DSTATE_SO_BUFFER_length_bias 0x00000002
+#define GEN7_3DSTATE_SO_BUFFER_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_SO_BUFFER_length 0x00000004
+
+struct GEN7_3DSTATE_SO_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOBufferIndex;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      SOBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   __gen_address_type                           SurfaceEndAddress;
+};
+
+/* Packs *values into the 4-dword command at dst.  The MOCS sub-struct is
+ * packed first and merged into dw1 bits 28:25.
+ */
+static inline void
+GEN7_3DSTATE_SO_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_3DSTATE_SO_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_SOBufferObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SOBufferObjectControlState, &values->SOBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->SOBufferIndex, 29, 30) |
+      __gen_field(dw_SOBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 11) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->SurfaceEndAddress, dw3);
+
+}
+
+/* 3DSTATE_SO_DECL_LIST (3/3/1/23) header macros.  The command length macro is
+ * 0 because the command is variable-length: a fixed 3-dword prefix followed
+ * by an array of 2-dword SO_DECL_ENTRY records (see the pack helper below,
+ * which emits only the fixed prefix).
+ */
+#define GEN7_3DSTATE_SO_DECL_LIST_length_bias 0x00000002
+#define GEN7_3DSTATE_SO_DECL_LIST_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 23
+
+#define GEN7_3DSTATE_SO_DECL_LIST_length 0x00000000
+
+#define GEN7_SO_DECL_ENTRY_length 0x00000002
+
+#define GEN7_SO_DECL_length 0x00000001
+
+/* SO_DECL: one 16-bit stream-output declaration -- which buffer slot to write,
+ * which vertex-output register to read, and a 4-bit component mask.  Packed
+ * into the low half of a dword; SO_DECL_ENTRY combines four of these.
+ */
+struct GEN7_SO_DECL {
+   uint32_t                                     OutputBufferSlot;
+   uint32_t                                     HoleFlag;
+   uint32_t                                     RegisterIndex;
+   uint32_t                                     ComponentMask;
+};
+
+/* Packs *values into a single dword at dst (only bits 13:0 are used). */
+static inline void
+GEN7_SO_DECL_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN7_SO_DECL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->OutputBufferSlot, 12, 13) |
+      __gen_field(values->HoleFlag, 11, 11) |
+      __gen_field(values->RegisterIndex, 4, 9) |
+      __gen_field(values->ComponentMask, 0, 3) |
+      0;
+
+}
+
+/* SO_DECL_ENTRY: one 2-dword record holding an SO_DECL for each of the four
+ * geometry streams, packed as four 16-bit lanes of a 64-bit qword
+ * (stream 0 in bits 15:0 ... stream 3 in bits 63:48).
+ */
+struct GEN7_SO_DECL_ENTRY {
+   struct GEN7_SO_DECL                          Stream3Decl;
+   struct GEN7_SO_DECL                          Stream2Decl;
+   struct GEN7_SO_DECL                          Stream1Decl;
+   struct GEN7_SO_DECL                          Stream0Decl;
+};
+
+/* Packs *values into two dwords at dst: each per-stream SO_DECL is packed to
+ * a scratch dword first, then the four are merged into one 64-bit value and
+ * split across dw[0] (low half) and dw[1] (high half).
+ */
+static inline void
+GEN7_SO_DECL_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN7_SO_DECL_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_Stream3Decl;
+   GEN7_SO_DECL_pack(data, &dw_Stream3Decl, &values->Stream3Decl);
+   uint32_t dw_Stream2Decl;
+   GEN7_SO_DECL_pack(data, &dw_Stream2Decl, &values->Stream2Decl);
+   uint32_t dw_Stream1Decl;
+   GEN7_SO_DECL_pack(data, &dw_Stream1Decl, &values->Stream1Decl);
+   uint32_t dw_Stream0Decl;
+   GEN7_SO_DECL_pack(data, &dw_Stream0Decl, &values->Stream0Decl);
+   uint64_t qw0 =
+      __gen_field(dw_Stream3Decl, 48, 63) |
+      __gen_field(dw_Stream2Decl, 32, 47) |
+      __gen_field(dw_Stream1Decl, 16, 31) |
+      __gen_field(dw_Stream0Decl, 0, 15) |
+      0;
+
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+
+/* 3DSTATE_SO_DECL_LIST fixed prefix: per-stream buffer-select masks and entry
+ * counts.  The SO_DECL_ENTRY array that follows dw2 is emitted separately by
+ * the caller.
+ */
+struct GEN7_3DSTATE_SO_DECL_LIST {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StreamtoBufferSelects3;
+   uint32_t                                     StreamtoBufferSelects2;
+   uint32_t                                     StreamtoBufferSelects1;
+   uint32_t                                     StreamtoBufferSelects0;
+   uint32_t                                     NumEntries3;
+   uint32_t                                     NumEntries2;
+   uint32_t                                     NumEntries1;
+   uint32_t                                     NumEntries0;
+   /* variable length fields follow */
+};
+
+/* Packs only the 3-dword fixed prefix at dst.  Note DwordLength occupies
+ * bits 8:0 here (9 bits, unlike the usual 8) to accommodate the variable
+ * entry list; the caller appends the SO_DECL_ENTRY records after dw2.
+ */
+static inline void
+GEN7_3DSTATE_SO_DECL_LIST_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_3DSTATE_SO_DECL_LIST * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StreamtoBufferSelects3, 12, 15) |
+      __gen_field(values->StreamtoBufferSelects2, 8, 11) |
+      __gen_field(values->StreamtoBufferSelects1, 4, 7) |
+      __gen_field(values->StreamtoBufferSelects0, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->NumEntries3, 24, 31) |
+      __gen_field(values->NumEntries2, 16, 23) |
+      __gen_field(values->NumEntries1, 8, 15) |
+      __gen_field(values->NumEntries0, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_STENCIL_BUFFER (3/3/0/6; 3 dwords): binds the stencil buffer --
+ * MOCS + pitch in dw1, relocated base address in dw2.
+ */
+#define GEN7_3DSTATE_STENCIL_BUFFER_length_bias 0x00000002
+#define GEN7_3DSTATE_STENCIL_BUFFER_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_STENCIL_BUFFER_length 0x00000003
+
+struct GEN7_3DSTATE_STENCIL_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      StencilBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+};
+
+/* Packs *values into the 3-dword command at dst; the base address goes
+ * through __gen_combine_address so a relocation can be recorded.
+ */
+static inline void
+GEN7_3DSTATE_STENCIL_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN7_3DSTATE_STENCIL_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_StencilBufferObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StencilBufferObjectControlState, &values->StencilBufferObjectControlState);
+   dw[1] =
+      __gen_field(dw_StencilBufferObjectControlState, 25, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+}
+
+/* 3DSTATE_STREAMOUT (3/3/0/30; 3 dwords): enables/configures the stream-output
+ * (transform feedback) unit -- per-buffer enables, stream select, statistics,
+ * and per-stream vertex URB read offset/length pairs.
+ */
+#define GEN7_3DSTATE_STREAMOUT_length_bias 0x00000002
+#define GEN7_3DSTATE_STREAMOUT_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 30,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_STREAMOUT_length 0x00000003
+
+struct GEN7_3DSTATE_STREAMOUT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOFunctionEnable;
+   uint32_t                                     RenderingDisable;
+   uint32_t                                     RenderStreamSelect;
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         SOStatisticsEnable;
+   uint32_t                                     SOBufferEnable3;
+   uint32_t                                     SOBufferEnable2;
+   uint32_t                                     SOBufferEnable1;
+   uint32_t                                     SOBufferEnable0;
+   uint32_t                                     Stream3VertexReadOffset;
+   uint32_t                                     Stream3VertexReadLength;
+   uint32_t                                     Stream2VertexReadOffset;
+   uint32_t                                     Stream2VertexReadLength;
+   uint32_t                                     Stream1VertexReadOffset;
+   uint32_t                                     Stream1VertexReadLength;
+   uint32_t                                     Stream0VertexReadOffset;
+   uint32_t                                     Stream0VertexReadLength;
+};
+
+/* Packs *values into the 3-dword command at dst. */
+static inline void
+GEN7_3DSTATE_STREAMOUT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_3DSTATE_STREAMOUT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SOFunctionEnable, 31, 31) |
+      __gen_field(values->RenderingDisable, 30, 30) |
+      __gen_field(values->RenderStreamSelect, 27, 28) |
+      __gen_field(values->ReorderMode, 26, 26) |
+      __gen_field(values->SOStatisticsEnable, 25, 25) |
+      __gen_field(values->SOBufferEnable3, 11, 11) |
+      __gen_field(values->SOBufferEnable2, 10, 10) |
+      __gen_field(values->SOBufferEnable1, 9, 9) |
+      __gen_field(values->SOBufferEnable0, 8, 8) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Stream3VertexReadOffset, 29, 29) |
+      __gen_field(values->Stream3VertexReadLength, 24, 28) |
+      __gen_field(values->Stream2VertexReadOffset, 21, 21) |
+      __gen_field(values->Stream2VertexReadLength, 16, 20) |
+      __gen_field(values->Stream1VertexReadOffset, 13, 13) |
+      __gen_field(values->Stream1VertexReadLength, 8, 12) |
+      __gen_field(values->Stream0VertexReadOffset, 5, 5) |
+      __gen_field(values->Stream0VertexReadLength, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_TE (3/3/0/28; 4 dwords): tessellation-engine state -- partitioning
+ * mode, output topology, domain, HW/SW mode, enable bit, and the two maximum
+ * tessellation factors (stored as raw IEEE float bits in dw2/dw3).
+ */
+#define GEN7_3DSTATE_TE_length_bias 0x00000002
+#define GEN7_3DSTATE_TE_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  2
+
+#define GEN7_3DSTATE_TE_length 0x00000004
+
+struct GEN7_3DSTATE_TE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     INTEGER                                            0
+#define     ODD_FRACTIONAL                                     1
+#define     EVEN_FRACTIONAL                                    2
+   uint32_t                                     Partitioning;
+#define     POINT                                              0
+#define     OUTPUT_LINE                                        1
+#define     OUTPUT_TRI_CW                                      2
+#define     OUTPUT_TRI_CCW                                     3
+   uint32_t                                     OutputTopology;
+#define     QUAD                                               0
+#define     TRI                                                1
+#define     ISOLINE                                            2
+   uint32_t                                     TEDomain;
+#define     HW_TESS                                            0
+#define     SW_TESS                                            1
+   uint32_t                                     TEMode;
+   bool                                         TEEnable;
+   float                                        MaximumTessellationFactorOdd;
+   float                                        MaximumTessellationFactorNotOdd;
+};
+
+/* Packs *values into the 4-dword command at dst. */
+static inline void
+GEN7_3DSTATE_TE_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_TE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Partitioning, 12, 13) |
+      __gen_field(values->OutputTopology, 8, 9) |
+      __gen_field(values->TEDomain, 4, 5) |
+      __gen_field(values->TEMode, 1, 2) |
+      __gen_field(values->TEEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->MaximumTessellationFactorOdd) |
+      0;
+
+   dw[3] =
+      __gen_float(values->MaximumTessellationFactorNotOdd) |
+      0;
+
+}
+
+/* 3DSTATE_URB_DS: configures the DS (domain shader) URB region —
+ * start address, per-entry allocation size, and entry count, per the
+ * struct field names.  Machine-generated; bit layout must not change. */
+#define GEN7_3DSTATE_URB_DS_length_bias 0x00000002
+#define GEN7_3DSTATE_URB_DS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 50,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_URB_DS_length 0x00000002
+
+struct GEN7_3DSTATE_URB_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DSURBStartingAddress;
+   uint32_t                                     DSURBEntryAllocationSize;
+   uint32_t                                     DSNumberofURBEntries;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_URB_DS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_3DSTATE_URB_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DSURBStartingAddress, 25, 29) |
+      __gen_field(values->DSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->DSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_URB_GS: GS (geometry shader) counterpart of 3DSTATE_URB_DS;
+ * identical dword layout, sub-opcode 51.  Machine-generated. */
+#define GEN7_3DSTATE_URB_GS_length_bias 0x00000002
+#define GEN7_3DSTATE_URB_GS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 51,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_URB_GS_length 0x00000002
+
+struct GEN7_3DSTATE_URB_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     GSURBStartingAddress;
+   uint32_t                                     GSURBEntryAllocationSize;
+   uint32_t                                     GSNumberofURBEntries;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_URB_GS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_3DSTATE_URB_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->GSURBStartingAddress, 25, 29) |
+      __gen_field(values->GSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->GSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_URB_HS: HS (hull shader) counterpart of 3DSTATE_URB_DS;
+ * identical dword layout, sub-opcode 49.  Machine-generated. */
+#define GEN7_3DSTATE_URB_HS_length_bias 0x00000002
+#define GEN7_3DSTATE_URB_HS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 49,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_URB_HS_length 0x00000002
+
+struct GEN7_3DSTATE_URB_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     HSURBStartingAddress;
+   uint32_t                                     HSURBEntryAllocationSize;
+   uint32_t                                     HSNumberofURBEntries;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_URB_HS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_3DSTATE_URB_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->HSURBStartingAddress, 25, 29) |
+      __gen_field(values->HSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->HSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_VERTEX_BUFFERS header macros: the command is variable-length
+ * (_length is 0 and the header sets no DwordLength), carrying one
+ * VERTEX_BUFFER_STATE element (4 dwords) per buffer.  Machine-generated. */
+#define GEN7_3DSTATE_VERTEX_BUFFERS_length_bias 0x00000002
+#define GEN7_3DSTATE_VERTEX_BUFFERS_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  8
+
+#define GEN7_3DSTATE_VERTEX_BUFFERS_length 0x00000000
+
+#define GEN7_VERTEX_BUFFER_STATE_length 0x00000004
+
+/* One vertex-buffer slot: index, access type, MOCS, pitch, start/end
+ * addresses and instance step rate.  Addresses use __gen_address_type
+ * so relocations can be applied at pack time. */
+struct GEN7_VERTEX_BUFFER_STATE {
+   uint32_t                                     VertexBufferIndex;
+#define     VERTEXDATA                                         0
+#define     INSTANCEDATA                                       1
+   uint32_t                                     BufferAccessType;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      VertexBufferMemoryObjectControlState;
+   uint32_t                                     AddressModifyEnable;
+   bool                                         NullVertexBuffer;
+   uint32_t                                     VertexFetchInvalidate;
+   uint32_t                                     BufferPitch;
+   __gen_address_type                           BufferStartingAddress;
+   __gen_address_type                           EndAddress;
+   uint32_t                                     InstanceDataStepRate;
+};
+
+static inline void
+GEN7_VERTEX_BUFFER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_VERTEX_BUFFER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* MOCS is packed by its own sub-packer, then merged into dw0 bits 16-19. */
+   uint32_t dw_VertexBufferMemoryObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_VertexBufferMemoryObjectControlState, &values->VertexBufferMemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->BufferAccessType, 20, 20) |
+      __gen_field(dw_VertexBufferMemoryObjectControlState, 16, 19) |
+      __gen_field(values->AddressModifyEnable, 14, 14) |
+      __gen_field(values->NullVertexBuffer, 13, 13) |
+      __gen_field(values->VertexFetchInvalidate, 12, 12) |
+      __gen_field(values->BufferPitch, 0, 11) |
+      0;
+
+   /* dw1/dw2 combine a relocated address with any extra bits (none here). */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->EndAddress, dw2);
+
+   dw[3] =
+      __gen_field(values->InstanceDataStepRate, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VERTEX_BUFFERS command header only; the caller appends the
+ * variable-length VERTEX_BUFFER_STATE payload separately. */
+struct GEN7_3DSTATE_VERTEX_BUFFERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Packs only dw0; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_VERTEX_BUFFERS_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN7_3DSTATE_VERTEX_BUFFERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_VERTEX_ELEMENTS header macros (variable-length command) and
+ * the 2-dword VERTEX_ELEMENT_STATE element it carries.  Machine-generated. */
+#define GEN7_3DSTATE_VERTEX_ELEMENTS_length_bias 0x00000002
+#define GEN7_3DSTATE_VERTEX_ELEMENTS_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  9
+
+#define GEN7_3DSTATE_VERTEX_ELEMENTS_length 0x00000000
+
+#define GEN7_VERTEX_ELEMENT_STATE_length 0x00000002
+
+struct GEN7_VERTEX_ELEMENT_STATE {
+   uint32_t                                     VertexBufferIndex;
+   bool                                         Valid;
+   uint32_t                                     SourceElementFormat;
+   bool                                         EdgeFlagEnable;
+   uint32_t                                     SourceElementOffset;
+   uint32_t                                     Component0Control;
+   uint32_t                                     Component1Control;
+   uint32_t                                     Component2Control;
+   uint32_t                                     Component3Control;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_VERTEX_ELEMENT_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_VERTEX_ELEMENT_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->Valid, 25, 25) |
+      __gen_field(values->SourceElementFormat, 16, 24) |
+      __gen_field(values->EdgeFlagEnable, 15, 15) |
+      __gen_field(values->SourceElementOffset, 0, 11) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Component0Control, 28, 30) |
+      __gen_field(values->Component1Control, 24, 26) |
+      __gen_field(values->Component2Control, 20, 22) |
+      __gen_field(values->Component3Control, 16, 18) |
+      0;
+
+}
+
+/* 3DSTATE_VERTEX_ELEMENTS command header only; the caller appends the
+ * variable-length VERTEX_ELEMENT_STATE payload separately. */
+struct GEN7_3DSTATE_VERTEX_ELEMENTS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Packs only dw0; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_VERTEX_ELEMENTS_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN7_3DSTATE_VERTEX_ELEMENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_VF_STATISTICS: single-dword command (note length_bias 1 and
+ * CommandSubType 1, unlike the 3D-state commands above); the enable bit
+ * shares dw0 with the opcode.  Machine-generated. */
+#define GEN7_3DSTATE_VF_STATISTICS_length_bias 0x00000001
+#define GEN7_3DSTATE_VF_STATISTICS_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 11
+
+#define GEN7_3DSTATE_VF_STATISTICS_length 0x00000001
+
+struct GEN7_3DSTATE_VF_STATISTICS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         StatisticsEnable;
+};
+
+/* Pack *values into 1 dword at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_VF_STATISTICS_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN7_3DSTATE_VF_STATISTICS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->StatisticsEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_CC: sets the CC viewport state pointer.
+ * __gen_offset places the pointer in bits 5-31, i.e. the pointer value
+ * is presumed 32-byte aligned — confirm against __gen_offset's definition. */
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length_bias 0x00000002
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 35,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length 0x00000002
+
+struct GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CCViewportPointer;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->CCViewportPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP: sets the combined SF/CLIP
+ * viewport state pointer (bits 6-31, so presumably 64-byte aligned —
+ * confirm against __gen_offset's definition).  Machine-generated. */
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length_bias 0x00000002
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 33,                  \
+   .DwordLength          =  0
+
+#define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length 0x00000002
+
+struct GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SFClipViewportPointer;
+};
+
+/* Pack *values into 2 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                                                  const struct GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SFClipViewportPointer, 6, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VS: vertex shader stage state (kernel pointer, sampler and
+ * binding-table counts, scratch space, URB read layout, thread limits,
+ * enables).  6-dword command.  Machine-generated — do not hand-edit. */
+#define GEN7_3DSTATE_VS_length_bias 0x00000002
+#define GEN7_3DSTATE_VS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 16,                  \
+   .DwordLength          =  4
+
+#define GEN7_3DSTATE_VS_length 0x00000006
+
+struct GEN7_3DSTATE_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     KernelStartPointer;
+   /* Value encodings for the field immediately below each group. */
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleVertexDispatch;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnableVME;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ScratchSpaceBaseOffset;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterforURBData;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+   bool                                         VertexCacheDisable;
+   bool                                         VSFunctionEnable;
+};
+
+/* Pack *values into 6 dwords at dst.  __gen_offset fields carry
+ * pre-aligned pointers (kernel start, scratch base); `data` is unused. */
+static inline void
+GEN7_3DSTATE_VS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SingleVertexDispatch, 31, 31) |
+      __gen_field(values->VectorMaskEnableVME, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->ScratchSpaceBaseOffset, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DispatchGRFStartRegisterforURBData, 20, 24) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[5] =
+      __gen_field(values->MaximumNumberofThreads, 25, 31) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->VertexCacheDisable, 1, 1) |
+      __gen_field(values->VSFunctionEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_WM: windower/pixel-pipeline state — rasterization, depth
+ * resolve, interpolation and multisample modes.  3-dword command.
+ * Machine-generated — field offsets must match the hardware layout. */
+#define GEN7_3DSTATE_WM_length_bias 0x00000002
+#define GEN7_3DSTATE_WM_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  1
+
+#define GEN7_3DSTATE_WM_length 0x00000003
+
+struct GEN7_3DSTATE_WM {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StatisticsEnable;
+   bool                                         DepthBufferClear;
+   bool                                         ThreadDispatchEnable;
+   bool                                         DepthBufferResolveEnable;
+   bool                                         HierarchicalDepthBufferResolveEnable;
+   bool                                         LegacyDiamondLineRasterization;
+   bool                                         PixelShaderKillPixel;
+   /* Value encodings for the field immediately below each group. */
+#define     PSCDEPTH_OFF                                       0
+#define     PSCDEPTH_ON                                        1
+#define     PSCDEPTH_ON_GE                                     2
+#define     PSCDEPTH_ON_LE                                     3
+   uint32_t                                     PixelShaderComputedDepthMode;
+#define     EDSC_NORMAL                                        0
+#define     EDSC_PSEXEC                                        1
+#define     EDSC_PREPS                                         2
+   uint32_t                                     EarlyDepthStencilControl;
+   bool                                         PixelShaderUsesSourceDepth;
+   bool                                         PixelShaderUsesSourceW;
+#define     INTERP_PIXEL                                       0
+#define     INTERP_CENTROID                                    2
+#define     INTERP_SAMPLE                                      3
+   uint32_t                                     PositionZWInterpolationMode;
+   uint32_t                                     BarycentricInterpolationMode;
+   bool                                         PixelShaderUsesInputCoverageMask;
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   uint32_t                                     LineAntialiasingRegionWidth;
+   bool                                         PolygonStippleEnable;
+   bool                                         LineStippleEnable;
+#define     RASTRULE_UPPER_LEFT                                0
+#define     RASTRULE_UPPER_RIGHT                               1
+   uint32_t                                     PointRasterizationRule;
+#define     MSRASTMODE_OFF_PIXEL                               0
+#define     MSRASTMODE_OFF_PATTERN                             1
+#define     MSRASTMODE_ON_PIXEL                                2
+#define     MSRASTMODE_ON_PATTERN                              3
+   uint32_t                                     MultisampleRasterizationMode;
+#define     MSDISPMODE_PERSAMPLE                               0
+#define     MSDISPMODE_PERPIXEL                                1
+   uint32_t                                     MultisampleDispatchMode;
+};
+
+/* Pack *values into 3 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_3DSTATE_WM_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_3DSTATE_WM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StatisticsEnable, 31, 31) |
+      __gen_field(values->DepthBufferClear, 30, 30) |
+      __gen_field(values->ThreadDispatchEnable, 29, 29) |
+      __gen_field(values->DepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->HierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->LegacyDiamondLineRasterization, 26, 26) |
+      __gen_field(values->PixelShaderKillPixel, 25, 25) |
+      __gen_field(values->PixelShaderComputedDepthMode, 23, 24) |
+      __gen_field(values->EarlyDepthStencilControl, 21, 22) |
+      __gen_field(values->PixelShaderUsesSourceDepth, 20, 20) |
+      __gen_field(values->PixelShaderUsesSourceW, 19, 19) |
+      __gen_field(values->PositionZWInterpolationMode, 17, 18) |
+      __gen_field(values->BarycentricInterpolationMode, 11, 16) |
+      __gen_field(values->PixelShaderUsesInputCoverageMask, 10, 10) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 8, 9) |
+      __gen_field(values->LineAntialiasingRegionWidth, 6, 7) |
+      __gen_field(values->PolygonStippleEnable, 4, 4) |
+      __gen_field(values->LineStippleEnable, 3, 3) |
+      __gen_field(values->PointRasterizationRule, 2, 2) |
+      __gen_field(values->MultisampleRasterizationMode, 0, 1) |
+      0;
+
+   dw[2] =
+      __gen_field(values->MultisampleDispatchMode, 31, 31) |
+      0;
+
+}
+
+/* GPGPU_OBJECT: media-pipeline compute dispatch command (Pipeline = 2,
+ * media opcode 1, sub-opcode 4); 8 dwords carrying interface descriptor,
+ * SLM offsets, indirect data, thread-group IDs and execution mask.
+ * Machine-generated — do not hand-edit bit offsets. */
+#define GEN7_GPGPU_OBJECT_length_bias 0x00000002
+#define GEN7_GPGPU_OBJECT_header                \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  6
+
+#define GEN7_GPGPU_OBJECT_length 0x00000008
+
+struct GEN7_GPGPU_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SharedLocalMemoryFixedOffset;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     SharedLocalMemoryOffset;
+   uint32_t                                     EndofThreadGroup;
+   /* Value encodings for HalfSliceDestinationSelect below. */
+#define     HalfSlice1                                         2
+#define     HalfSlice0                                         1
+#define     EitherHalfSlice                                    0
+   uint32_t                                     HalfSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   uint32_t                                     ThreadGroupIDX;
+   uint32_t                                     ThreadGroupIDY;
+   uint32_t                                     ThreadGroupIDZ;
+   uint32_t                                     ExecutionMask;
+};
+
+/* Pack *values into 8 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_GPGPU_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_GPGPU_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SharedLocalMemoryFixedOffset, 7, 7) |
+      __gen_field(values->InterfaceDescriptorOffset, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SharedLocalMemoryOffset, 28, 31) |
+      __gen_field(values->EndofThreadGroup, 24, 24) |
+      __gen_field(values->HalfSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->ThreadGroupIDX, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDY, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ThreadGroupIDZ, 0, 31) |
+      0;
+
+   dw[7] =
+      __gen_field(values->ExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* GPGPU_WALKER: compute thread-group walker dispatch (media pipeline,
+ * sub-opcode A = 5); 11 dwords (0x0b) describing SIMD size, per-axis
+ * thread counters, 3D thread-group start/dimension, and the right/bottom
+ * partial-group execution masks.  Machine-generated — do not hand-edit. */
+#define GEN7_GPGPU_WALKER_length_bias 0x00000002
+#define GEN7_GPGPU_WALKER_header                \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcodeA           =  5,                  \
+   .DwordLength          =  9
+
+#define GEN7_GPGPU_WALKER_length 0x0000000b
+
+struct GEN7_GPGPU_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcodeA;
+   bool                                         IndirectParameterEnable;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   /* Value encodings for SIMDSize below. */
+#define     SIMD8                                              0
+#define     SIMD16                                             1
+#define     SIMD32                                             2
+   uint32_t                                     SIMDSize;
+   uint32_t                                     ThreadDepthCounterMaximum;
+   uint32_t                                     ThreadHeightCounterMaximum;
+   uint32_t                                     ThreadWidthCounterMaximum;
+   uint32_t                                     ThreadGroupIDStartingX;
+   uint32_t                                     ThreadGroupIDXDimension;
+   uint32_t                                     ThreadGroupIDStartingY;
+   uint32_t                                     ThreadGroupIDYDimension;
+   uint32_t                                     ThreadGroupIDStartingZ;
+   uint32_t                                     ThreadGroupIDZDimension;
+   uint32_t                                     RightExecutionMask;
+   uint32_t                                     BottomExecutionMask;
+};
+
+/* Pack *values into 11 dwords at dst; `data` is unused (uniform signature). */
+static inline void
+GEN7_GPGPU_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_GPGPU_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcodeA, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SIMDSize, 30, 31) |
+      __gen_field(values->ThreadDepthCounterMaximum, 16, 21) |
+      __gen_field(values->ThreadHeightCounterMaximum, 8, 13) |
+      __gen_field(values->ThreadWidthCounterMaximum, 0, 5) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ThreadGroupIDStartingX, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->ThreadGroupIDXDimension, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDStartingY, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ThreadGroupIDYDimension, 0, 31) |
+      0;
+
+   dw[7] =
+      __gen_field(values->ThreadGroupIDStartingZ, 0, 31) |
+      0;
+
+   dw[8] =
+      __gen_field(values->ThreadGroupIDZDimension, 0, 31) |
+      0;
+
+   dw[9] =
+      __gen_field(values->RightExecutionMask, 0, 31) |
+      0;
+
+   dw[10] =
+      __gen_field(values->BottomExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_CURBE_LOAD: loads CURBE (constant) data for the media/GPGPU
+ * pipeline; 4 dwords.  dw[1] is always emitted as zero. */
+#define GEN7_MEDIA_CURBE_LOAD_length_bias 0x00000002
+#define GEN7_MEDIA_CURBE_LOAD_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  1,                  \
+   .DwordLength          =  2
+
+#define GEN7_MEDIA_CURBE_LOAD_length 0x00000004
+
+struct GEN7_MEDIA_CURBE_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CURBETotalDataLength;
+   uint32_t                                     CURBEDataStartAddress;
+};
+
+/* Encode *values into the 4 dwords at dst; 'data' (relocation context)
+ * is unused because no relocated addresses are emitted. */
+static inline void
+GEN7_MEDIA_CURBE_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN7_MEDIA_CURBE_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->CURBETotalDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_field(values->CURBEDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_INTERFACE_DESCRIPTOR_LOAD: points the media/GPGPU pipeline at the
+ * interface descriptor data; 4 dwords.  dw[1] is always emitted as zero. */
+#define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length_bias 0x00000002
+#define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD_header\
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          =  2
+
+#define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length 0x00000004
+
+struct GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorTotalLength;
+   uint32_t                                     InterfaceDescriptorDataStartAddress;
+};
+
+/* Encode *values into the 4 dwords at dst; 'data' (relocation context)
+ * is unused.  The descriptor start address is packed with __gen_offset,
+ * i.e. it is an offset, not a relocated address. */
+static inline void
+GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->InterfaceDescriptorTotalLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->InterfaceDescriptorDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_OBJECT: dispatches a single media/GPGPU thread.  Variable-length
+ * command (length define is 0): inline data may follow the fixed 6 dwords
+ * and is the caller's responsibility to append. */
+#define GEN7_MEDIA_OBJECT_length_bias 0x00000002
+#define GEN7_MEDIA_OBJECT_header                \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  0
+
+#define GEN7_MEDIA_OBJECT_length 0x00000000
+
+struct GEN7_MEDIA_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+/* Values for ThreadSynchronization below. */
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+/* Values for UseScoreboard below. */
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+/* Values for HalfSliceDestinationSelect below. */
+#define     HalfSlice1                                         2
+#define     HalfSlice0                                         1
+#define     Eitherhalfslice                                    0
+   uint32_t                                     HalfSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   /* NOTE(review): "ScoredboardY" looks like a generator typo for
+    * "ScoreboardY"; kept as-is because callers use this generated name. */
+   uint32_t                                     ScoredboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   /* NOTE(review): declared bool but packed into bits 0..7 below — this
+    * is presumably an 8-bit mask; confirm against the generator/XML. */
+   bool                                         ScoreboardMask;
+   /* variable length fields follow */
+};
+
+/* Encode the fixed 6 dwords of *values at dst.  'data' is forwarded to
+ * __gen_combine_address to relocate IndirectDataStartAddress in dw[3]. */
+static inline void
+GEN7_MEDIA_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_MEDIA_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->HalfSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   /* No extra bits are ORed into the address dword. */
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoredboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MEDIA_OBJECT_PRT: persistent root thread dispatch; fixed 16 dwords,
+ * including 12 dwords of inline data. */
+#define GEN7_MEDIA_OBJECT_PRT_length_bias 0x00000002
+#define GEN7_MEDIA_OBJECT_PRT_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          = 14
+
+#define GEN7_MEDIA_OBJECT_PRT_length 0x00000010
+
+struct GEN7_MEDIA_OBJECT_PRT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+   bool                                         PRT_FenceNeeded;
+/* Values for PRT_FenceType below. */
+#define     Rootthreadqueue                                    0
+#define     VFEstateflush                                      1
+   uint32_t                                     PRT_FenceType;
+   uint32_t                                     InlineData[12];
+};
+
+/* Encode *values into the 16 dwords at dst; 'data' (relocation context)
+ * is unused.  dw[3] is always emitted as zero; dwords 4..15 carry the
+ * 12 inline-data dwords verbatim. */
+static inline void
+GEN7_MEDIA_OBJECT_PRT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN7_MEDIA_OBJECT_PRT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->PRT_FenceNeeded, 23, 23) |
+      __gen_field(values->PRT_FenceType, 22, 22) |
+      0;
+
+   dw[3] =
+      0;
+
+   for (uint32_t i = 0, j = 4; i < 12; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->InlineData[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+/* MEDIA_OBJECT_WALKER: hardware-walked media object dispatch over a 2D
+ * block/loop pattern.  Variable-length command (length define is 0):
+ * inline data may follow the fixed 17 dwords. */
+#define GEN7_MEDIA_OBJECT_WALKER_length_bias 0x00000002
+#define GEN7_MEDIA_OBJECT_WALKER_header         \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  3
+
+#define GEN7_MEDIA_OBJECT_WALKER_length 0x00000000
+
+struct GEN7_MEDIA_OBJECT_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+/* Values for ThreadSynchronization below. */
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+/* Values for UseScoreboard below. */
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   /* NOTE(review): declared bool but packed into bits 0..7 below — this
+    * is presumably an 8-bit mask; confirm against the generator/XML. */
+   bool                                         ScoreboardMask;
+   bool                                         DualMode;
+   bool                                         Repel;
+   uint32_t                                     ColorCountMinusOne;
+   uint32_t                                     MiddleLoopExtraSteps;
+   uint32_t                                     LocalMidLoopUnitY;
+   uint32_t                                     MidLoopUnitX;
+   uint32_t                                     GlobalLoopExecCount;
+   uint32_t                                     LocalLoopExecCount;
+   uint32_t                                     BlockResolutionY;
+   uint32_t                                     BlockResolutionX;
+   uint32_t                                     LocalStartY;
+   uint32_t                                     LocalStartX;
+   uint32_t                                     LocalEndY;
+   uint32_t                                     LocalEndX;
+   uint32_t                                     LocalOuterLoopStrideY;
+   uint32_t                                     LocalOuterLoopStrideX;
+   uint32_t                                     LocalInnerLoopUnitY;
+   uint32_t                                     LocalInnerLoopUnitX;
+   uint32_t                                     GlobalResolutionY;
+   uint32_t                                     GlobalResolutionX;
+   uint32_t                                     GlobalStartY;
+   uint32_t                                     GlobalStartX;
+   uint32_t                                     GlobalOuterLoopStrideY;
+   uint32_t                                     GlobalOuterLoopStrideX;
+   uint32_t                                     GlobalInnerLoopUnitY;
+   uint32_t                                     GlobalInnerLoopUnitX;
+   /* variable length fields follow */
+};
+
+/* Encode the fixed 17 dwords of *values at dst.  'data' (relocation
+ * context) is unused: IndirectDataStartAddress is packed with
+ * __gen_offset, not relocated.  dw[4] is always emitted as zero. */
+static inline void
+GEN7_MEDIA_OBJECT_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_MEDIA_OBJECT_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 4) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   dw[4] =
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->DualMode, 31, 31) |
+      __gen_field(values->Repel, 30, 30) |
+      __gen_field(values->ColorCountMinusOne, 24, 27) |
+      __gen_field(values->MiddleLoopExtraSteps, 16, 20) |
+      __gen_field(values->LocalMidLoopUnitY, 12, 13) |
+      __gen_field(values->MidLoopUnitX, 8, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->GlobalLoopExecCount, 16, 25) |
+      __gen_field(values->LocalLoopExecCount, 0, 9) |
+      0;
+
+   dw[8] =
+      __gen_field(values->BlockResolutionY, 16, 24) |
+      __gen_field(values->BlockResolutionX, 0, 8) |
+      0;
+
+   dw[9] =
+      __gen_field(values->LocalStartY, 16, 24) |
+      __gen_field(values->LocalStartX, 0, 8) |
+      0;
+
+   dw[10] =
+      __gen_field(values->LocalEndY, 16, 24) |
+      __gen_field(values->LocalEndX, 0, 8) |
+      0;
+
+   dw[11] =
+      __gen_field(values->LocalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->LocalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[12] =
+      __gen_field(values->LocalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->LocalInnerLoopUnitX, 0, 9) |
+      0;
+
+   dw[13] =
+      __gen_field(values->GlobalResolutionY, 16, 24) |
+      __gen_field(values->GlobalResolutionX, 0, 8) |
+      0;
+
+   dw[14] =
+      __gen_field(values->GlobalStartY, 16, 25) |
+      __gen_field(values->GlobalStartX, 0, 9) |
+      0;
+
+   dw[15] =
+      __gen_field(values->GlobalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->GlobalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[16] =
+      __gen_field(values->GlobalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->GlobalInnerLoopUnitX, 0, 9) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MEDIA_STATE_FLUSH: flushes media pipeline state; 2 dwords. */
+#define GEN7_MEDIA_STATE_FLUSH_length_bias 0x00000002
+#define GEN7_MEDIA_STATE_FLUSH_header           \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  0
+
+#define GEN7_MEDIA_STATE_FLUSH_length 0x00000002
+
+struct GEN7_MEDIA_STATE_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     WatermarkRequired;
+   uint32_t                                     InterfaceDescriptorOffset;
+};
+
+/* Encode *values into the 2 dwords at dst; 'data' (relocation context)
+ * is unused. */
+static inline void
+GEN7_MEDIA_STATE_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_MEDIA_STATE_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->WatermarkRequired, 6, 6) |
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+}
+
+/* MEDIA_VFE_STATE: configures the video front end / GPGPU fixed function
+ * (threads, URB, CURBE, scoreboard); 8 dwords. */
+#define GEN7_MEDIA_VFE_STATE_length_bias 0x00000002
+#define GEN7_MEDIA_VFE_STATE_header             \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  0,                  \
+   .DwordLength          =  6
+
+#define GEN7_MEDIA_VFE_STATE_length 0x00000008
+
+struct GEN7_MEDIA_VFE_STATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     NumberofURBEntries;
+/* Values for ResetGatewayTimer below. */
+#define     Maintainingtheexistingtimestampstate               0
+#define     Resettingrelativetimerandlatchingtheglobaltimestamp       1
+   uint32_t                                     ResetGatewayTimer;
+/* Values for BypassGatewayControl below. */
+#define     MaintainingOpenGatewayForwardMsgCloseGatewayprotocollegacymode       0
+#define     BypassingOpenGatewayCloseGatewayprotocol           1
+   uint32_t                                     BypassGatewayControl;
+/* Values for GatewayMMIOAccessControl below. */
+#define     NoMMIOreadwriteallowed                             0
+#define     MMIOreadwritetoanyaddress                          2
+   uint32_t                                     GatewayMMIOAccessControl;
+   uint32_t                                     GPGPUMode;
+   uint32_t                                     URBEntryAllocationSize;
+   uint32_t                                     CURBEAllocationSize;
+/* Values for ScoreboardEnable below. */
+#define     Scoreboarddisabled                                 0
+#define     Scoreboardenabled                                  1
+   uint32_t                                     ScoreboardEnable;
+/* Values for ScoreboardType below. */
+#define     StallingScoreboard                                 0
+#define     NonStallingScoreboard                              1
+   uint32_t                                     ScoreboardType;
+   uint32_t                                     ScoreboardMask;
+   uint32_t                                     Scoreboard3DeltaY;
+   uint32_t                                     Scoreboard3DeltaX;
+   uint32_t                                     Scoreboard2DeltaY;
+   uint32_t                                     Scoreboard2DeltaX;
+   uint32_t                                     Scoreboard1DeltaY;
+   uint32_t                                     Scoreboard1DeltaX;
+   uint32_t                                     Scoreboard0DeltaY;
+   uint32_t                                     Scoreboard0DeltaX;
+   uint32_t                                     Scoreboard7DeltaY;
+   uint32_t                                     Scoreboard7DeltaX;
+   uint32_t                                     Scoreboard6DeltaY;
+   uint32_t                                     Scoreboard6DeltaX;
+   uint32_t                                     Scoreboard5DeltaY;
+   uint32_t                                     Scoreboard5DeltaX;
+   uint32_t                                     Scoreboard4DeltaY;
+   uint32_t                                     Scoreboard4DeltaX;
+};
+
+/* Encode *values into the 8 dwords at dst.  'data' (relocation context)
+ * is unused: ScratchSpaceBasePointer is packed with __gen_offset, not
+ * relocated.  dw[3] is always emitted as zero. */
+static inline void
+GEN7_MEDIA_VFE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN7_MEDIA_VFE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->MaximumNumberofThreads, 16, 31) |
+      __gen_field(values->NumberofURBEntries, 8, 15) |
+      __gen_field(values->ResetGatewayTimer, 7, 7) |
+      __gen_field(values->BypassGatewayControl, 6, 6) |
+      __gen_field(values->GatewayMMIOAccessControl, 3, 4) |
+      __gen_field(values->GPGPUMode, 2, 2) |
+      0;
+
+   dw[3] =
+      0;
+
+   dw[4] =
+      __gen_field(values->URBEntryAllocationSize, 16, 31) |
+      __gen_field(values->CURBEAllocationSize, 0, 15) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardEnable, 31, 31) |
+      __gen_field(values->ScoreboardType, 30, 30) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* Scoreboard delta pairs 0-3 (4 bits each). */
+   dw[6] =
+      __gen_field(values->Scoreboard3DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard3DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard2DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard2DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard1DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard1DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard0DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard0DeltaX, 0, 3) |
+      0;
+
+   /* Scoreboard delta pairs 4-7 (4 bits each). */
+   dw[7] =
+      __gen_field(values->Scoreboard7DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard7DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard6DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard6DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard5DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard5DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard4DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard4DeltaX, 0, 3) |
+      0;
+
+}
+
+/* MI_ARB_CHECK: single-dword MI command (opcode 5); header fields only. */
+#define GEN7_MI_ARB_CHECK_length_bias 0x00000001
+#define GEN7_MI_ARB_CHECK_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  5
+
+#define GEN7_MI_ARB_CHECK_length 0x00000001
+
+struct GEN7_MI_ARB_CHECK {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Encode *values into the single dword at dst; 'data' is unused. */
+static inline void
+GEN7_MI_ARB_CHECK_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_MI_ARB_CHECK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_ARB_ON_OFF: single-dword MI command toggling arbitration via the
+ * ArbitrationEnable bit. */
+#define GEN7_MI_ARB_ON_OFF_length_bias 0x00000001
+#define GEN7_MI_ARB_ON_OFF_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  8
+
+#define GEN7_MI_ARB_ON_OFF_length 0x00000001
+
+struct GEN7_MI_ARB_ON_OFF {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         ArbitrationEnable;
+};
+
+/* Encode *values into the single dword at dst; 'data' is unused. */
+static inline void
+GEN7_MI_ARB_ON_OFF_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN7_MI_ARB_ON_OFF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ArbitrationEnable, 0, 0) |
+      0;
+
+}
+
+/* MI_BATCH_BUFFER_END: single-dword MI command terminating a batch
+ * buffer; header fields only. */
+#define GEN7_MI_BATCH_BUFFER_END_length_bias 0x00000001
+#define GEN7_MI_BATCH_BUFFER_END_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 10
+
+#define GEN7_MI_BATCH_BUFFER_END_length 0x00000001
+
+struct GEN7_MI_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Encode *values into the single dword at dst; 'data' is unused. */
+static inline void
+GEN7_MI_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_MI_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_BATCH_BUFFER_START: 2-dword MI command chaining to another batch
+ * buffer; dw[1] carries the relocated start address. */
+#define GEN7_MI_BATCH_BUFFER_START_length_bias 0x00000002
+#define GEN7_MI_BATCH_BUFFER_START_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 49,                  \
+   .DwordLength          =  0
+
+#define GEN7_MI_BATCH_BUFFER_START_length 0x00000002
+
+struct GEN7_MI_BATCH_BUFFER_START {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         ClearCommandBufferEnable;
+/* Values for AddressSpaceIndicator below. */
+#define     ASI_GGTT                                           0
+#define     ASI_PPGTT                                          1
+   uint32_t                                     AddressSpaceIndicator;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BatchBufferStartAddress;
+};
+
+/* Encode *values into the 2 dwords at dst.  'data' is forwarded to
+ * __gen_combine_address to relocate BatchBufferStartAddress in dw[1]. */
+static inline void
+GEN7_MI_BATCH_BUFFER_START_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN7_MI_BATCH_BUFFER_START * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ClearCommandBufferEnable, 11, 11) |
+      __gen_field(values->AddressSpaceIndicator, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* No extra bits are ORed into the address dword. */
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->BatchBufferStartAddress, dw1);
+
+}
+
+/* MI_CLFLUSH: cacheline-flush MI command.  Variable-length (length
+ * define is 0): the fixed 3 dwords may be followed by inline data. */
+#define GEN7_MI_CLFLUSH_length_bias 0x00000002
+#define GEN7_MI_CLFLUSH_header                  \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 39
+
+#define GEN7_MI_CLFLUSH_length 0x00000000
+
+struct GEN7_MI_CLFLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+/* Values for UseGlobalGTT below. */
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PageBaseAddress;
+   uint32_t                                     StartingCachelineOffset;
+   __gen_address_type                           PageBaseAddressHigh;
+   /* variable length fields follow */
+};
+
+/* Encode the fixed 3 dwords of *values at dst.  'data' is forwarded to
+ * __gen_combine_address to relocate PageBaseAddress (dw[1], combined
+ * with StartingCachelineOffset) and PageBaseAddressHigh (dw[2]). */
+static inline void
+GEN7_MI_CLFLUSH_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN7_MI_CLFLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->StartingCachelineOffset, 6, 11) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PageBaseAddress, dw1);
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->PageBaseAddressHigh, dw2);
+
+   /* variable length fields follow */
+}
+
+/* MI_CONDITIONAL_BATCH_BUFFER_END: 3-dword MI command comparing
+ * CompareDataDword against memory at CompareAddress. */
+#define GEN7_MI_CONDITIONAL_BATCH_BUFFER_END_length_bias 0x00000002
+#define GEN7_MI_CONDITIONAL_BATCH_BUFFER_END_header\
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 54,                  \
+   .UseGlobalGTT         =  0,                  \
+   .CompareSemaphore     =  0,                  \
+   .DwordLength          =  0
+
+#define GEN7_MI_CONDITIONAL_BATCH_BUFFER_END_length 0x00000002
+
+struct GEN7_MI_CONDITIONAL_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     CompareSemaphore;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CompareDataDword;
+   __gen_address_type                           CompareAddress;
+};
+
+/* Encode *values into the 3 dwords at dst.  'data' is forwarded to
+ * __gen_combine_address to relocate CompareAddress in dw[2]. */
+static inline void
+GEN7_MI_CONDITIONAL_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN7_MI_CONDITIONAL_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->CompareSemaphore, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CompareDataDword, 0, 31) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->CompareAddress, dw2);
+
+}
+
+/* MI_FLUSH: single-dword MI flush command with per-cache control bits. */
+#define GEN7_MI_FLUSH_length_bias 0x00000001
+#define GEN7_MI_FLUSH_header                    \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  4
+
+#define GEN7_MI_FLUSH_length 0x00000001
+
+struct GEN7_MI_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         GenericMediaStateClear;
+/* Values for GlobalSnapshotCountReset below. */
+#define     DontReset                                          0
+#define     Reset                                              1
+   bool                                         GlobalSnapshotCountReset;
+/* Values for RenderCacheFlushInhibit below. */
+#define     Flush                                              0
+#define     DontFlush                                          1
+   bool                                         RenderCacheFlushInhibit;
+/* Values for StateInstructionCacheInvalidate below. */
+#define     DontInvalidate                                     0
+#define     Invalidate                                         1
+   bool                                         StateInstructionCacheInvalidate;
+};
+
+/* Encode *values into the single dword at dst; 'data' is unused. */
+static inline void
+GEN7_MI_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                   const struct GEN7_MI_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IndirectStatePointersDisable, 5, 5) |
+      __gen_field(values->GenericMediaStateClear, 4, 4) |
+      __gen_field(values->GlobalSnapshotCountReset, 3, 3) |
+      __gen_field(values->RenderCacheFlushInhibit, 2, 2) |
+      __gen_field(values->StateInstructionCacheInvalidate, 1, 1) |
+      0;
+
+}
+
+/* MI_LOAD_REGISTER_IMM: writes an immediate dword (DataDWord) into the MMIO
+ * register at RegisterOffset; ByteWriteDisables masks individual bytes. */
+#define GEN7_MI_LOAD_REGISTER_IMM_length_bias 0x00000002
+#define GEN7_MI_LOAD_REGISTER_IMM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 34,                  \
+   .DwordLength          =  1
+
+#define GEN7_MI_LOAD_REGISTER_IMM_length 0x00000003
+
+struct GEN7_MI_LOAD_REGISTER_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     ByteWriteDisables;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterOffset;
+   uint32_t                                     DataDWord;
+};
+
+/* Packs *values into 3 dwords: header, dword-aligned register offset
+ * (__gen_offset, bits 2..22), immediate data. */
+static inline void
+GEN7_MI_LOAD_REGISTER_IMM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_MI_LOAD_REGISTER_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ByteWriteDisables, 8, 11) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterOffset, 2, 22) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord, 0, 31) |
+      0;
+
+}
+
+/* MI_LOAD_REGISTER_MEM: loads the MMIO register at RegisterAddress from the
+ * dword in memory at MemoryAddress (relocated via __gen_combine_address). */
+#define GEN7_MI_LOAD_REGISTER_MEM_length_bias 0x00000002
+#define GEN7_MI_LOAD_REGISTER_MEM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 41,                  \
+   .DwordLength          =  1
+
+#define GEN7_MI_LOAD_REGISTER_MEM_length 0x00000003
+
+struct GEN7_MI_LOAD_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     AsyncModeEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into 3 dwords: header, register address, memory address. */
+static inline void
+GEN7_MI_LOAD_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_MI_LOAD_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->AsyncModeEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+}
+
+/* MI_NOOP: no-op padding command; may optionally latch IdentificationNumber
+ * into the NOPID register when the write-enable bit is set. */
+#define GEN7_MI_NOOP_length_bias 0x00000001
+#define GEN7_MI_NOOP_header                     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  0
+
+#define GEN7_MI_NOOP_length 0x00000001
+
+struct GEN7_MI_NOOP {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IdentificationNumberRegisterWriteEnable;
+   uint32_t                                     IdentificationNumber;
+};
+
+/* Packs *values into a single dword at dst. */
+static inline void
+GEN7_MI_NOOP_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN7_MI_NOOP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IdentificationNumberRegisterWriteEnable, 22, 22) |
+      __gen_field(values->IdentificationNumber, 0, 21) |
+      0;
+
+}
+
+/* MI_PREDICATE: updates the render-command predicate from the load/combine/
+ * compare operations encoded in the single header dword. */
+#define GEN7_MI_PREDICATE_length_bias 0x00000001
+#define GEN7_MI_PREDICATE_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 12
+
+#define GEN7_MI_PREDICATE_length 0x00000001
+
+struct GEN7_MI_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     LOAD_KEEP                                          0
+#define     LOAD_LOAD                                          2
+#define     LOAD_LOADINV                                       3
+   uint32_t                                     LoadOperation;
+#define     COMBINE_SET                                        0
+#define     COMBINE_AND                                        1
+#define     COMBINE_OR                                         2
+#define     COMBINE_XOR                                        3
+   uint32_t                                     CombineOperation;
+#define     COMPARE_SRCS_EQUAL                                 2
+#define     COMPARE_DELTAS_EQUAL                               3
+   uint32_t                                     CompareOperation;
+};
+
+/* Packs *values into a single dword at dst. */
+static inline void
+GEN7_MI_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_MI_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->LoadOperation, 6, 7) |
+      __gen_field(values->CombineOperation, 3, 4) |
+      __gen_field(values->CompareOperation, 0, 1) |
+      0;
+
+}
+
+/* MI_REPORT_HEAD: single-dword command with no payload fields beyond the
+ * opcode header. */
+#define GEN7_MI_REPORT_HEAD_length_bias 0x00000001
+#define GEN7_MI_REPORT_HEAD_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  7
+
+#define GEN7_MI_REPORT_HEAD_length 0x00000001
+
+struct GEN7_MI_REPORT_HEAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs the header dword only. */
+static inline void
+GEN7_MI_REPORT_HEAD_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_MI_REPORT_HEAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_SEMAPHORE_MBOX: inter-ring semaphore mailbox; dw1 carries the semaphore
+ * data and dw2 is emitted as zero (reserved here). */
+#define GEN7_MI_SEMAPHORE_MBOX_length_bias 0x00000002
+#define GEN7_MI_SEMAPHORE_MBOX_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 22,                  \
+   .DwordLength          =  1
+
+#define GEN7_MI_SEMAPHORE_MBOX_length 0x00000003
+
+struct GEN7_MI_SEMAPHORE_MBOX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RVSYNC                                             0
+#define     RBSYNC                                             2
+#define     UseGeneralRegisterSelect                           3
+   uint32_t                                     RegisterSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SemaphoreDataDword;
+};
+
+/* Packs *values into 3 dwords: header, semaphore data, zero. */
+static inline void
+GEN7_MI_SEMAPHORE_MBOX_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_MI_SEMAPHORE_MBOX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->RegisterSelect, 16, 17) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SemaphoreDataDword, 0, 31) |
+      0;
+
+   dw[2] =
+      0;
+
+}
+
+/* MI_SET_CONTEXT: switches the logical render context; dw1 combines the
+ * context address relocation with the save/restore control bits. */
+#define GEN7_MI_SET_CONTEXT_length_bias 0x00000002
+#define GEN7_MI_SET_CONTEXT_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 24,                  \
+   .DwordLength          =  0
+
+#define GEN7_MI_SET_CONTEXT_length 0x00000002
+
+struct GEN7_MI_SET_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           LogicalContextAddress;
+   uint32_t                                     ReservedMustbe1;
+   bool                                         ExtendedStateSaveEnable;
+   bool                                         ExtendedStateRestoreEnable;
+   uint32_t                                     ForceRestore;
+   uint32_t                                     RestoreInhibit;
+};
+
+/* Packs *values into 2 dwords; the control bits are folded into the address
+ * dword via __gen_combine_address. */
+static inline void
+GEN7_MI_SET_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN7_MI_SET_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->ReservedMustbe1, 8, 8) |
+      __gen_field(values->ExtendedStateSaveEnable, 3, 3) |
+      __gen_field(values->ExtendedStateRestoreEnable, 2, 2) |
+      __gen_field(values->ForceRestore, 1, 1) |
+      __gen_field(values->RestoreInhibit, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->LogicalContextAddress, dw1);
+
+}
+
+/* MI_STORE_DATA_IMM: stores immediate data to the (non-relocated) Address;
+ * dw1 is emitted as zero (reserved here).
+ * NOTE(review): the pack function writes dw[0]..dw[4] (5 dwords) while
+ * _length is 0x4 — callers that allocate only 4 dwords would be overrun by
+ * the DataDWord1 write. Confirm against the generator/PRM (dword vs qword
+ * store variants).
+ */
+#define GEN7_MI_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN7_MI_STORE_DATA_IMM_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 32,                  \
+   .DwordLength          =  2
+
+#define GEN7_MI_STORE_DATA_IMM_length 0x00000004
+
+struct GEN7_MI_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Address;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+static inline void
+GEN7_MI_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_MI_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->Address, 2, 31) |
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+/* MI_STORE_DATA_INDEX: stores data at Offset within the hardware status page.
+ * NOTE(review): the pack function writes dw[0]..dw[3] (4 dwords) while
+ * _length is 0x3 — the unconditional DataDWord1 write exceeds the declared
+ * length; confirm against the generator/PRM.
+ */
+#define GEN7_MI_STORE_DATA_INDEX_length_bias 0x00000002
+#define GEN7_MI_STORE_DATA_INDEX_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 33,                  \
+   .DwordLength          =  1
+
+#define GEN7_MI_STORE_DATA_INDEX_length 0x00000003
+
+struct GEN7_MI_STORE_DATA_INDEX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Offset;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+static inline void
+GEN7_MI_STORE_DATA_INDEX_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_MI_STORE_DATA_INDEX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Offset, 2, 11) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+/* MI_SUSPEND_FLUSH: single-dword command toggling the SuspendFlush bit. */
+#define GEN7_MI_SUSPEND_FLUSH_length_bias 0x00000001
+#define GEN7_MI_SUSPEND_FLUSH_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 11
+
+#define GEN7_MI_SUSPEND_FLUSH_length 0x00000001
+
+struct GEN7_MI_SUSPEND_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         SuspendFlush;
+};
+
+/* Packs *values into a single dword at dst. */
+static inline void
+GEN7_MI_SUSPEND_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN7_MI_SUSPEND_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->SuspendFlush, 0, 0) |
+      0;
+
+}
+
+/* MI_TOPOLOGY_FILTER: single-dword command carrying a 6-bit topology
+ * filter value. */
+#define GEN7_MI_TOPOLOGY_FILTER_length_bias 0x00000001
+#define GEN7_MI_TOPOLOGY_FILTER_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 13
+
+#define GEN7_MI_TOPOLOGY_FILTER_length 0x00000001
+
+struct GEN7_MI_TOPOLOGY_FILTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     TopologyFilterValue;
+};
+
+/* Packs *values into a single dword at dst. */
+static inline void
+GEN7_MI_TOPOLOGY_FILTER_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN7_MI_TOPOLOGY_FILTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->TopologyFilterValue, 0, 5) |
+      0;
+
+}
+
+/* MI_UPDATE_GTT: variable-length command (_length is 0; DwordLength chosen
+ * by the caller) updating GTT entries starting at EntryAddress. The pack
+ * function emits only the fixed prefix; the caller appends the entries. */
+#define GEN7_MI_UPDATE_GTT_length_bias 0x00000002
+#define GEN7_MI_UPDATE_GTT_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 35
+
+#define GEN7_MI_UPDATE_GTT_length 0x00000000
+
+struct GEN7_MI_UPDATE_GTT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           EntryAddress;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN7_MI_UPDATE_GTT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN7_MI_UPDATE_GTT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->EntryAddress, dw1);
+
+   /* variable length fields follow */
+}
+
+/* MI_URB_CLEAR: clears URBClearLength units of the URB starting at
+ * URBAddress (packed as an offset, bits 0..13 of dw1). */
+#define GEN7_MI_URB_CLEAR_length_bias 0x00000002
+#define GEN7_MI_URB_CLEAR_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 25,                  \
+   .DwordLength          =  0
+
+#define GEN7_MI_URB_CLEAR_length 0x00000002
+
+struct GEN7_MI_URB_CLEAR {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBClearLength;
+   uint32_t                                     URBAddress;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN7_MI_URB_CLEAR_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_MI_URB_CLEAR * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBClearLength, 16, 28) |
+      __gen_offset(values->URBAddress, 0, 13) |
+      0;
+
+}
+
+/* MI_USER_INTERRUPT: single-dword command with no payload fields beyond the
+ * opcode header. */
+#define GEN7_MI_USER_INTERRUPT_length_bias 0x00000001
+#define GEN7_MI_USER_INTERRUPT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  2
+
+#define GEN7_MI_USER_INTERRUPT_length 0x00000001
+
+struct GEN7_MI_USER_INTERRUPT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs the header dword only. */
+static inline void
+GEN7_MI_USER_INTERRUPT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_MI_USER_INTERRUPT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_WAIT_FOR_EVENT: single-dword command whose per-bit wait-enable flags
+ * select display events (blank, scanline, flip-pending) per pipe/plane/
+ * sprite A/B/C. Gaps in the bit layout (e.g. bits 4, 6, 7, 12) are
+ * reserved and not packed. */
+#define GEN7_MI_WAIT_FOR_EVENT_length_bias 0x00000001
+#define GEN7_MI_WAIT_FOR_EVENT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  3
+
+#define GEN7_MI_WAIT_FOR_EVENT_length 0x00000001
+
+struct GEN7_MI_WAIT_FOR_EVENT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         DisplayPipeCHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeCVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteCFlipPendingWaitEnable;
+#define     Notenabled                                         0
+   uint32_t                                     ConditionCodeWaitSelect;
+   bool                                         DisplayPlaneCFlipPendingWaitEnable;
+   bool                                         DisplayPipeCScanLineWaitEnable;
+   bool                                         DisplayPipeBHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeBVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteBFlipPendingWaitEnable;
+   bool                                         DisplayPlaneBFlipPendingWaitEnable;
+   bool                                         DisplayPipeBScanLineWaitEnable;
+   bool                                         DisplayPipeAHorizontalBlankWaitEnable;
+   bool                                         DisplayPipeAVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteAFlipPendingWaitEnable;
+   bool                                         DisplayPlaneAFlipPendingWaitEnable;
+   bool                                         DisplayPipeAScanLineWaitEnable;
+};
+
+static inline void
+GEN7_MI_WAIT_FOR_EVENT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN7_MI_WAIT_FOR_EVENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPipeCHorizontalBlankWaitEnable, 22, 22) |
+      __gen_field(values->DisplayPipeCVerticalBlankWaitEnable, 21, 21) |
+      __gen_field(values->DisplaySpriteCFlipPendingWaitEnable, 20, 20) |
+      __gen_field(values->ConditionCodeWaitSelect, 16, 19) |
+      __gen_field(values->DisplayPlaneCFlipPendingWaitEnable, 15, 15) |
+      __gen_field(values->DisplayPipeCScanLineWaitEnable, 14, 14) |
+      __gen_field(values->DisplayPipeBHorizontalBlankWaitEnable, 13, 13) |
+      __gen_field(values->DisplayPipeBVerticalBlankWaitEnable, 11, 11) |
+      __gen_field(values->DisplaySpriteBFlipPendingWaitEnable, 10, 10) |
+      __gen_field(values->DisplayPlaneBFlipPendingWaitEnable, 9, 9) |
+      __gen_field(values->DisplayPipeBScanLineWaitEnable, 8, 8) |
+      __gen_field(values->DisplayPipeAHorizontalBlankWaitEnable, 5, 5) |
+      __gen_field(values->DisplayPipeAVerticalBlankWaitEnable, 3, 3) |
+      __gen_field(values->DisplaySpriteAFlipPendingWaitEnable, 2, 2) |
+      __gen_field(values->DisplayPlaneAFlipPendingWaitEnable, 1, 1) |
+      __gen_field(values->DisplayPipeAScanLineWaitEnable, 0, 0) |
+      0;
+
+}
+
+/* PIPE_CONTROL: 5-dword pipeline flush/synchronization command. dw1 carries
+ * the flush/invalidate/stall control bits, dw2 the (relocated) post-sync
+ * write address, and dw3/dw4 the low/high halves of the 64-bit immediate
+ * used when PostSyncOperation is WriteImmediateData.
+ */
+#define GEN7_PIPE_CONTROL_length_bias 0x00000002
+#define GEN7_PIPE_CONTROL_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  2,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  3
+
+#define GEN7_PIPE_CONTROL_length 0x00000005
+
+struct GEN7_PIPE_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     DAT_PPGTT                                          0
+#define     DAT_GGTT                                           1
+   uint32_t                                     DestinationAddressType;
+#define     NoLRIOperation                                     0
+#define     MMIOWriteImmediateData                             1
+   uint32_t                                     LRIPostSyncOperation;
+   uint32_t                                     StoreDataIndex;
+   uint32_t                                     CommandStreamerStallEnable;
+#define     DontReset                                          0
+#define     Reset                                              1
+   uint32_t                                     GlobalSnapshotCountReset;
+   uint32_t                                     TLBInvalidate;
+   bool                                         GenericMediaStateClear;
+#define     NoWrite                                            0
+#define     WriteImmediateData                                 1
+#define     WritePSDepthCount                                  2
+#define     WriteTimestamp                                     3
+   uint32_t                                     PostSyncOperation;
+   bool                                         DepthStallEnable;
+#define     DisableFlush                                       0
+#define     EnableFlush                                        1
+   bool                                         RenderTargetCacheFlushEnable;
+   bool                                         InstructionCacheInvalidateEnable;
+   bool                                         TextureCacheInvalidationEnable;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         NotifyEnable;
+   bool                                         PipeControlFlushEnable;
+   bool                                         DCFlushEnable;
+   bool                                         VFCacheInvalidationEnable;
+   bool                                         ConstantCacheInvalidationEnable;
+   bool                                         StateCacheInvalidationEnable;
+   bool                                         StallAtPixelScoreboard;
+#define     FlushDisabled                                      0
+#define     FlushEnabled                                       1
+   bool                                         DepthCacheFlushEnable;
+   __gen_address_type                           Address;
+   uint32_t                                     ImmediateData;
+   uint32_t                                     ImmediateData0;
+};
+
+static inline void
+GEN7_PIPE_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_PIPE_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DestinationAddressType, 24, 24) |
+      __gen_field(values->LRIPostSyncOperation, 23, 23) |
+      __gen_field(values->StoreDataIndex, 21, 21) |
+      __gen_field(values->CommandStreamerStallEnable, 20, 20) |
+      __gen_field(values->GlobalSnapshotCountReset, 19, 19) |
+      __gen_field(values->TLBInvalidate, 18, 18) |
+      __gen_field(values->GenericMediaStateClear, 16, 16) |
+      __gen_field(values->PostSyncOperation, 14, 15) |
+      __gen_field(values->DepthStallEnable, 13, 13) |
+      __gen_field(values->RenderTargetCacheFlushEnable, 12, 12) |
+      __gen_field(values->InstructionCacheInvalidateEnable, 11, 11) |
+      __gen_field(values->TextureCacheInvalidationEnable, 10, 10) |
+      __gen_field(values->IndirectStatePointersDisable, 9, 9) |
+      __gen_field(values->NotifyEnable, 8, 8) |
+      __gen_field(values->PipeControlFlushEnable, 7, 7) |
+      __gen_field(values->DCFlushEnable, 5, 5) |
+      __gen_field(values->VFCacheInvalidationEnable, 4, 4) |
+      __gen_field(values->ConstantCacheInvalidationEnable, 3, 3) |
+      __gen_field(values->StateCacheInvalidationEnable, 2, 2) |
+      __gen_field(values->StallAtPixelScoreboard, 1, 1) |
+      __gen_field(values->DepthCacheFlushEnable, 0, 0) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->Address, dw2);
+
+   dw[3] =
+      __gen_field(values->ImmediateData, 0, 31) |
+      0;
+
+   /* BUG FIX: dw[4] previously re-packed ImmediateData, leaving the struct's
+    * ImmediateData0 field (the second immediate dword) entirely unused. */
+   dw[4] =
+      __gen_field(values->ImmediateData0, 0, 31) |
+      0;
+
+}
+
+/* SCISSOR_RECT: 2-dword scissor rectangle state entry; each dword packs a
+ * Y coordinate (bits 16..31) and an X coordinate (bits 0..15). */
+#define GEN7_SCISSOR_RECT_length 0x00000002
+
+struct GEN7_SCISSOR_RECT {
+   uint32_t                                     ScissorRectangleYMin;
+   uint32_t                                     ScissorRectangleXMin;
+   uint32_t                                     ScissorRectangleYMax;
+   uint32_t                                     ScissorRectangleXMax;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN7_SCISSOR_RECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN7_SCISSOR_RECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ScissorRectangleYMin, 16, 31) |
+      __gen_field(values->ScissorRectangleXMin, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ScissorRectangleYMax, 16, 31) |
+      __gen_field(values->ScissorRectangleXMax, 0, 15) |
+      0;
+
+}
+
+/* SF_CLIP_VIEWPORT: 16-dword (0x10) viewport state entry. dw0..dw5 hold the
+ * viewport matrix elements, dw6..dw7 are reserved zeros, dw8..dw11 hold the
+ * clip guardband extents, and dw12..dw15 are zero-filled padding. */
+#define GEN7_SF_CLIP_VIEWPORT_length 0x00000010
+
+struct GEN7_SF_CLIP_VIEWPORT {
+   float                                        ViewportMatrixElementm00;
+   float                                        ViewportMatrixElementm11;
+   float                                        ViewportMatrixElementm22;
+   float                                        ViewportMatrixElementm30;
+   float                                        ViewportMatrixElementm31;
+   float                                        ViewportMatrixElementm32;
+   float                                        XMinClipGuardband;
+   float                                        XMaxClipGuardband;
+   float                                        YMinClipGuardband;
+   float                                        YMaxClipGuardband;
+};
+
+static inline void
+GEN7_SF_CLIP_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN7_SF_CLIP_VIEWPORT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->ViewportMatrixElementm00) |
+      0;
+
+   dw[1] =
+      __gen_float(values->ViewportMatrixElementm11) |
+      0;
+
+   dw[2] =
+      __gen_float(values->ViewportMatrixElementm22) |
+      0;
+
+   dw[3] =
+      __gen_float(values->ViewportMatrixElementm30) |
+      0;
+
+   dw[4] =
+      __gen_float(values->ViewportMatrixElementm31) |
+      0;
+
+   dw[5] =
+      __gen_float(values->ViewportMatrixElementm32) |
+      0;
+
+   dw[6] =
+      0;
+
+   dw[7] =
+      0;
+
+   dw[8] =
+      __gen_float(values->XMinClipGuardband) |
+      0;
+
+   dw[9] =
+      __gen_float(values->XMaxClipGuardband) |
+      0;
+
+   dw[10] =
+      __gen_float(values->YMinClipGuardband) |
+      0;
+
+   dw[11] =
+      __gen_float(values->YMaxClipGuardband) |
+      0;
+
+   /* Zero the trailing 4 reserved dwords (dw12..dw15). */
+   for (uint32_t i = 0, j = 12; i < 4; i += 1, j++) {
+      dw[j] =
+         0;
+   }
+
+}
+
+#define GEN7_BLEND_STATE_length 0x00000002
+
+/* Gen7 BLEND_STATE: one per-render-target blend-state element, packed into
+ * GEN7_BLEND_STATE_length (2) DWords by GEN7_BLEND_STATE_pack().
+ *
+ * The #define groups immediately preceding a field list that field's legal
+ * encodings; repeated groups (e.g. the two BLENDFUNCTION_* runs) are
+ * deliberate in this generated-style header — one copy per field they
+ * document.
+ */
+struct GEN7_BLEND_STATE {
+   bool                                         ColorBufferBlendEnable;
+   bool                                         IndependentAlphaBlendEnable;
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+   uint32_t                                     AlphaBlendFunction;
+#define     BLENDFACTOR_ONE                                    1
+#define     BLENDFACTOR_SRC_COLOR                              2
+#define     BLENDFACTOR_SRC_ALPHA                              3
+#define     BLENDFACTOR_DST_ALPHA                              4
+#define     BLENDFACTOR_DST_COLOR                              5
+#define     BLENDFACTOR_SRC_ALPHA_SATURATE                     6
+#define     BLENDFACTOR_CONST_COLOR                            7
+#define     BLENDFACTOR_CONST_ALPHA                            8
+#define     BLENDFACTOR_SRC1_COLOR                             9
+#define     BLENDFACTOR_SRC1_ALPHA                            10
+#define     BLENDFACTOR_ZERO                                  17
+#define     BLENDFACTOR_INV_SRC_COLOR                         18
+#define     BLENDFACTOR_INV_SRC_ALPHA                         19
+#define     BLENDFACTOR_INV_DST_ALPHA                         20
+#define     BLENDFACTOR_INV_DST_COLOR                         21
+#define     BLENDFACTOR_INV_CONST_COLOR                       23
+#define     BLENDFACTOR_INV_CONST_ALPHA                       24
+#define     BLENDFACTOR_INV_SRC1_COLOR                        25
+#define     BLENDFACTOR_INV_SRC1_ALPHA                        26
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+   uint32_t                                     ColorBlendFunction;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   bool                                         AlphaToCoverageEnable;
+   bool                                         AlphaToOneEnable;
+   bool                                         AlphaToCoverageDitherEnable;
+   bool                                         WriteDisableAlpha;
+   bool                                         WriteDisableRed;
+   bool                                         WriteDisableGreen;
+   bool                                         WriteDisableBlue;
+   bool                                         LogicOpEnable;
+#define     LOGICOP_CLEAR                                      0
+#define     LOGICOP_NOR                                        1
+#define     LOGICOP_AND_INVERTED                               2
+#define     LOGICOP_COPY_INVERTED                              3
+#define     LOGICOP_AND_REVERSE                                4
+#define     LOGICOP_INVERT                                     5
+#define     LOGICOP_XOR                                        6
+#define     LOGICOP_NAND                                       7
+#define     LOGICOP_AND                                        8
+#define     LOGICOP_EQUIV                                      9
+#define     LOGICOP_NOOP                                      10
+#define     LOGICOP_OR_INVERTED                               11
+#define     LOGICOP_COPY                                      12
+#define     LOGICOP_OR_REVERSE                                13
+#define     LOGICOP_OR                                        14
+#define     LOGICOP_SET                                       15
+   uint32_t                                     LogicOpFunction;
+   bool                                         AlphaTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     AlphaTestFunction;
+   bool                                         ColorDitherEnable;
+   uint32_t                                     XDitherOffset;
+   uint32_t                                     YDitherOffset;
+#define     COLORCLAMP_UNORM                                   0
+#define     COLORCLAMP_SNORM                                   1
+#define     COLORCLAMP_RTFORMAT                                2
+   uint32_t                                     ColorClampRange;
+   bool                                         PreBlendColorClampEnable;
+   bool                                         PostBlendColorClampEnable;
+};
+
+/* Pack *values into the two DWords at dst.  Each __gen_field(v, lo, hi)
+ * term contributes v at bit positions lo..hi of the DWord being built;
+ * the trailing "| 0" merely terminates the OR chain.  `data` is the
+ * opaque user context forwarded to the __gen_* helpers. */
+static inline void
+GEN7_BLEND_STATE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN7_BLEND_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: blend functions and factors. */
+   dw[0] =
+      __gen_field(values->ColorBufferBlendEnable, 31, 31) |
+      __gen_field(values->IndependentAlphaBlendEnable, 30, 30) |
+      __gen_field(values->AlphaBlendFunction, 26, 28) |
+      __gen_field(values->SourceAlphaBlendFactor, 20, 24) |
+      __gen_field(values->DestinationAlphaBlendFactor, 15, 19) |
+      __gen_field(values->ColorBlendFunction, 11, 13) |
+      __gen_field(values->SourceBlendFactor, 5, 9) |
+      __gen_field(values->DestinationBlendFactor, 0, 4) |
+      0;
+
+   /* DWord 1: alpha-to-coverage, write masks, logic op, alpha test,
+    * dithering and color clamping. */
+   dw[1] =
+      __gen_field(values->AlphaToCoverageEnable, 31, 31) |
+      __gen_field(values->AlphaToOneEnable, 30, 30) |
+      __gen_field(values->AlphaToCoverageDitherEnable, 29, 29) |
+      __gen_field(values->WriteDisableAlpha, 27, 27) |
+      __gen_field(values->WriteDisableRed, 26, 26) |
+      __gen_field(values->WriteDisableGreen, 25, 25) |
+      __gen_field(values->WriteDisableBlue, 24, 24) |
+      __gen_field(values->LogicOpEnable, 22, 22) |
+      __gen_field(values->LogicOpFunction, 18, 21) |
+      __gen_field(values->AlphaTestEnable, 16, 16) |
+      __gen_field(values->AlphaTestFunction, 13, 15) |
+      __gen_field(values->ColorDitherEnable, 12, 12) |
+      __gen_field(values->XDitherOffset, 10, 11) |
+      __gen_field(values->YDitherOffset, 8, 9) |
+      __gen_field(values->ColorClampRange, 2, 3) |
+      __gen_field(values->PreBlendColorClampEnable, 1, 1) |
+      __gen_field(values->PostBlendColorClampEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN7_CC_VIEWPORT_length 0x00000002
+
+/* Gen7 CC_VIEWPORT: the per-viewport depth range, two DWords
+ * (one float each) packed by GEN7_CC_VIEWPORT_pack(). */
+struct GEN7_CC_VIEWPORT {
+   float                                        MinimumDepth;
+   float                                        MaximumDepth;
+};
+
+/* Pack *values into the two DWords at dst: DWord 0 holds MinimumDepth,
+ * DWord 1 holds MaximumDepth, each stored through __gen_float().
+ * `data` is the opaque user context (unused by this element). */
+static inline void
+GEN7_CC_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN7_CC_VIEWPORT * restrict values)
+{
+   uint32_t *out = (uint32_t * restrict) dst;
+
+   out[0] = __gen_float(values->MinimumDepth);
+   out[1] = __gen_float(values->MaximumDepth);
+}
+
+#define GEN7_COLOR_CALC_STATE_length 0x00000006
+
+/* Gen7 COLOR_CALC_STATE: stencil reference values, alpha-test reference
+ * and the blend constant color, packed into 6 DWords by
+ * GEN7_COLOR_CALC_STATE_pack().
+ *
+ * NOTE(review): AlphaReferenceValueAsUNORM8 and AlphaReferenceValueAsFLOAT32
+ * are packed into the same DWord; presumably only the one selected by
+ * AlphaTestFormat should be set — confirm against the PRM. */
+struct GEN7_COLOR_CALC_STATE {
+   uint32_t                                     StencilReferenceValue;
+   uint32_t                                     BackFaceStencilReferenceValue;
+#define     Cancelled                                          0
+#define     NotCancelled                                       1
+   uint32_t                                     RoundDisableFunctionDisable;
+#define     ALPHATEST_UNORM8                                   0
+#define     ALPHATEST_FLOAT32                                  1
+   uint32_t                                     AlphaTestFormat;
+   uint32_t                                     AlphaReferenceValueAsUNORM8;
+   float                                        AlphaReferenceValueAsFLOAT32;
+   float                                        BlendConstantColorRed;
+   float                                        BlendConstantColorGreen;
+   float                                        BlendConstantColorBlue;
+   float                                        BlendConstantColorAlpha;
+};
+
+/* Pack *values into the six DWords at dst.  __gen_field(v, lo, hi) places
+ * v at bits lo..hi; __gen_float(v) stores v's raw bit pattern. */
+static inline void
+GEN7_COLOR_CALC_STATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN7_COLOR_CALC_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->StencilReferenceValue, 24, 31) |
+      __gen_field(values->BackFaceStencilReferenceValue, 16, 23) |
+      __gen_field(values->RoundDisableFunctionDisable, 15, 15) |
+      __gen_field(values->AlphaTestFormat, 0, 0) |
+      0;
+
+   /* NOTE(review): both alpha-reference encodings are ORed into dw[1];
+    * the caller appears expected to populate only the member matching
+    * AlphaTestFormat and leave the other zero — confirm against the PRM. */
+   dw[1] =
+      __gen_field(values->AlphaReferenceValueAsUNORM8, 0, 31) |
+      __gen_float(values->AlphaReferenceValueAsFLOAT32) |
+      0;
+
+   /* DWords 2..5: blend constant color, RGBA order, one float each. */
+   dw[2] =
+      __gen_float(values->BlendConstantColorRed) |
+      0;
+
+   dw[3] =
+      __gen_float(values->BlendConstantColorGreen) |
+      0;
+
+   dw[4] =
+      __gen_float(values->BlendConstantColorBlue) |
+      0;
+
+   dw[5] =
+      __gen_float(values->BlendConstantColorAlpha) |
+      0;
+
+}
+
+#define GEN7_DEPTH_STENCIL_STATE_length 0x00000003
+
+/* Gen7 DEPTH_STENCIL_STATE: front/back stencil state, stencil masks and
+ * depth test state, packed into 3 DWords by GEN7_DEPTH_STENCIL_STATE_pack().
+ * The repeated COMPAREFUNCTION_* / STENCILOP_* #define groups document the
+ * field that follows each group. */
+struct GEN7_DEPTH_STENCIL_STATE {
+   bool                                         StencilTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     StencilTestFunction;
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+   uint32_t                                     StencilFailOp;
+   uint32_t                                     StencilPassDepthFailOp;
+   uint32_t                                     StencilPassDepthPassOp;
+   bool                                         StencilBufferWriteEnable;
+   bool                                         DoubleSidedStencilEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     BackFaceStencilTestFunction;
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+   uint32_t                                     BackfaceStencilFailOp;
+   uint32_t                                     BackfaceStencilPassDepthFailOp;
+   uint32_t                                     BackfaceStencilPassDepthPassOp;
+   uint32_t                                     StencilTestMask;
+   uint32_t                                     StencilWriteMask;
+   uint32_t                                     BackfaceStencilTestMask;
+   uint32_t                                     BackfaceStencilWriteMask;
+   bool                                         DepthTestEnable;
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+   uint32_t                                     DepthTestFunction;
+   bool                                         DepthBufferWriteEnable;
+};
+
+/* Pack *values into the three DWords at dst: dw0 = stencil ops/functions,
+ * dw1 = stencil test/write masks, dw2 = depth test state. */
+static inline void
+GEN7_DEPTH_STENCIL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_DEPTH_STENCIL_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->StencilTestEnable, 31, 31) |
+      __gen_field(values->StencilTestFunction, 28, 30) |
+      __gen_field(values->StencilFailOp, 25, 27) |
+      __gen_field(values->StencilPassDepthFailOp, 22, 24) |
+      __gen_field(values->StencilPassDepthPassOp, 19, 21) |
+      __gen_field(values->StencilBufferWriteEnable, 18, 18) |
+      __gen_field(values->DoubleSidedStencilEnable, 15, 15) |
+      __gen_field(values->BackFaceStencilTestFunction, 12, 14) |
+      __gen_field(values->BackfaceStencilFailOp, 9, 11) |
+      __gen_field(values->BackfaceStencilPassDepthFailOp, 6, 8) |
+      __gen_field(values->BackfaceStencilPassDepthPassOp, 3, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StencilTestMask, 24, 31) |
+      __gen_field(values->StencilWriteMask, 16, 23) |
+      __gen_field(values->BackfaceStencilTestMask, 8, 15) |
+      __gen_field(values->BackfaceStencilWriteMask, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DepthTestEnable, 31, 31) |
+      __gen_field(values->DepthTestFunction, 27, 29) |
+      __gen_field(values->DepthBufferWriteEnable, 26, 26) |
+      0;
+
+}
+
+#define GEN7_INTERFACE_DESCRIPTOR_DATA_length 0x00000008
+
+/* Gen7 INTERFACE_DESCRIPTOR_DATA: per-kernel descriptor for the compute
+ * (GPGPU) pipeline — kernel entry point, sampler/binding-table pointers,
+ * constant URB layout and thread-group parameters.  8 DWords, packed by
+ * GEN7_INTERFACE_DESCRIPTOR_DATA_pack(). */
+struct GEN7_INTERFACE_DESCRIPTOR_DATA {
+   uint32_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     SamplerStatePointer;
+#define     Nosamplersused                                     0
+#define     Between1and4samplersused                           1
+#define     Between5and8samplersused                           2
+#define     Between9and12samplersused                          3
+#define     Between13and16samplersused                         4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTablePointer;
+   uint32_t                                     BindingTableEntryCount;
+   uint32_t                                     ConstantURBEntryReadLength;
+   uint32_t                                     ConstantURBEntryReadOffset;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         BarrierEnable;
+   uint32_t                                     SharedLocalMemorySize;
+   uint32_t                                     NumberofThreadsinGPGPUThreadGroup;
+};
+
+/* Pack *values into the eight DWords at dst.  __gen_offset() is used for
+ * pointer-style fields whose low bits are implicitly zero (alignment). */
+static inline void
+GEN7_INTERFACE_DESCRIPTOR_DATA_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN7_INTERFACE_DESCRIPTOR_DATA * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SingleProgramFlow, 18, 18) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->SamplerStatePointer, 5, 31) |
+      __gen_field(values->SamplerCount, 2, 4) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->BindingTablePointer, 5, 15) |
+      __gen_field(values->BindingTableEntryCount, 0, 4) |
+      0;
+
+   dw[4] =
+      __gen_field(values->ConstantURBEntryReadLength, 16, 31) |
+      __gen_field(values->ConstantURBEntryReadOffset, 0, 15) |
+      0;
+
+   dw[5] =
+      __gen_field(values->RoundingMode, 22, 23) |
+      __gen_field(values->BarrierEnable, 21, 21) |
+      __gen_field(values->SharedLocalMemorySize, 16, 20) |
+      __gen_field(values->NumberofThreadsinGPGPUThreadGroup, 0, 7) |
+      0;
+
+   /* DWords 6 and 7 carry no fields in this struct and are written as zero. */
+   dw[6] =
+      0;
+
+   dw[7] =
+      0;
+
+}
+
+#define GEN7_BINDING_TABLE_STATE_length 0x00000001
+
+/* Gen7 BINDING_TABLE_STATE: a single binding-table entry — one DWord
+ * holding the offset of a SURFACE_STATE (packed by
+ * GEN7_BINDING_TABLE_STATE_pack()). */
+struct GEN7_BINDING_TABLE_STATE {
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Pack *values into the single DWord at dst: the surface-state pointer
+ * occupies bits 5..31 (stored via __gen_offset, i.e. the low bits of the
+ * offset are implicitly zero).  `data` is the opaque user context. */
+static inline void
+GEN7_BINDING_TABLE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN7_BINDING_TABLE_STATE * restrict values)
+{
+   uint32_t *out = (uint32_t * restrict) dst;
+
+   out[0] = __gen_offset(values->SurfaceStatePointer, 5, 31);
+}
+
+#define GEN7_RENDER_SURFACE_STATE_length 0x00000008
+
+/* Gen7 RENDER_SURFACE_STATE: full surface descriptor (type, format, tiling,
+ * dimensions, multisampling, MCS and clear color), 8 DWords, packed by
+ * GEN7_RENDER_SURFACE_STATE_pack().
+ *
+ * NOTE(review): MinimumArrayElement0 is the generator's dedup of a second
+ * "Minimum Array Element" field — presumably the SURFTYPE_BUFFER/STRBUF
+ * variant occupying a wider bit range; confirm against the PRM. */
+struct GEN7_RENDER_SURFACE_STATE {
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_BUFFER                                    4
+#define     SURFTYPE_STRBUF                                    5
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         SurfaceArray;
+   uint32_t                                     SurfaceFormat;
+#define     VALIGN_2                                           0
+#define     VALIGN_4                                           1
+   uint32_t                                     SurfaceVerticalAlignment;
+#define     HALIGN_4                                           0
+#define     HALIGN_8                                           1
+   uint32_t                                     SurfaceHorizontalAlignment;
+   uint32_t                                     TiledSurface;
+#define     TILEWALK_XMAJOR                                    0
+#define     TILEWALK_YMAJOR                                    1
+   uint32_t                                     TileWalk;
+   uint32_t                                     VerticalLineStride;
+   uint32_t                                     VerticalLineStrideOffset;
+#define     ARYSPC_FULL                                        0
+#define     ARYSPC_LOD0                                        1
+   uint32_t                                     SurfaceArraySpacing;
+   uint32_t                                     RenderCacheReadWriteMode;
+#define     NORMAL_MODE                                        0
+#define     PROGRESSIVE_FRAME                                  2
+#define     INTERLACED_FRAME                                   3
+   uint32_t                                     MediaBoundaryPixelMode;
+   uint32_t                                     CubeFaceEnables;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     Depth;
+   uint32_t                                     SurfacePitch;
+#define     RTROTATE_0DEG                                      0
+#define     RTROTATE_90DEG                                     1
+#define     RTROTATE_270DEG                                    3
+   uint32_t                                     RenderTargetRotation;
+   uint32_t                                     MinimumArrayElement;
+   uint32_t                                     RenderTargetViewExtent;
+#define     MSFMT_MSS                                          0
+#define     MSFMT_DEPTH_STENCIL                                1
+   uint32_t                                     MultisampledSurfaceStorageFormat;
+#define     MULTISAMPLECOUNT_1                                 0
+#define     MULTISAMPLECOUNT_4                                 2
+#define     MULTISAMPLECOUNT_8                                 3
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     MultisamplePositionPaletteIndex;
+   uint32_t                                     MinimumArrayElement0;
+   uint32_t                                     XOffset;
+   uint32_t                                     YOffset;
+   struct GEN7_MEMORY_OBJECT_CONTROL_STATE      SurfaceObjectControlState;
+   uint32_t                                     SurfaceMinLOD;
+   uint32_t                                     MIPCountLOD;
+   __gen_address_type                           MCSBaseAddress;
+   uint32_t                                     MCSSurfacePitch;
+   __gen_address_type                           AppendCounterAddress;
+   bool                                         AppendCounterEnable;
+   bool                                         MCSEnable;
+   uint32_t                                     XOffsetforUVPlane;
+   uint32_t                                     YOffsetforUVPlane;
+#define     CC_ZERO                                            0
+#define     CC_ONE                                             1
+   uint32_t                                     RedClearColor;
+#define     CC_ZERO                                            0
+#define     CC_ONE                                             1
+   uint32_t                                     GreenClearColor;
+#define     CC_ZERO                                            0
+#define     CC_ONE                                             1
+   uint32_t                                     BlueClearColor;
+#define     CC_ZERO                                            0
+#define     CC_ONE                                             1
+   uint32_t                                     AlphaClearColor;
+   float                                        ResourceMinLOD;
+};
+
+/* Pack *values into the eight DWords of a Gen7 RENDER_SURFACE_STATE at dst.
+ * Address-carrying DWords (1 and 6) are routed through
+ * __gen_combine_address() so the caller can emit relocations. */
+static inline void
+GEN7_RENDER_SURFACE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN7_RENDER_SURFACE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->SurfaceArray, 28, 28) |
+      __gen_field(values->SurfaceFormat, 18, 26) |
+      __gen_field(values->SurfaceVerticalAlignment, 16, 17) |
+      __gen_field(values->SurfaceHorizontalAlignment, 15, 15) |
+      __gen_field(values->TiledSurface, 14, 14) |
+      __gen_field(values->TileWalk, 13, 13) |
+      __gen_field(values->VerticalLineStride, 12, 12) |
+      __gen_field(values->VerticalLineStrideOffset, 11, 11) |
+      __gen_field(values->SurfaceArraySpacing, 10, 10) |
+      __gen_field(values->RenderCacheReadWriteMode, 8, 8) |
+      __gen_field(values->MediaBoundaryPixelMode, 6, 7) |
+      __gen_field(values->CubeFaceEnables, 0, 5) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->SurfaceBaseAddress, dw1);
+
+   dw[2] =
+      __gen_field(values->Height, 16, 29) |
+      __gen_field(values->Width, 0, 13) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   /* BUG FIX: bits 0..26 are the SURFTYPE_BUFFER/STRBUF variant of the
+    * minimum-array-element field and must come from the separate
+    * MinimumArrayElement0 member (previously this packed
+    * MinimumArrayElement a second time, which both left
+    * MinimumArrayElement0 dead and corrupted the multisample fields of
+    * non-buffer surfaces whenever a non-zero array element was set). */
+   dw[4] =
+      __gen_field(values->RenderTargetRotation, 29, 30) |
+      __gen_field(values->MinimumArrayElement, 18, 28) |
+      __gen_field(values->RenderTargetViewExtent, 7, 17) |
+      __gen_field(values->MultisampledSurfaceStorageFormat, 6, 6) |
+      __gen_field(values->NumberofMultisamples, 3, 5) |
+      __gen_field(values->MultisamplePositionPaletteIndex, 0, 2) |
+      __gen_field(values->MinimumArrayElement0, 0, 26) |
+      0;
+
+   uint32_t dw_SurfaceObjectControlState;
+   GEN7_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceObjectControlState, &values->SurfaceObjectControlState);
+   dw[5] =
+      __gen_offset(values->XOffset, 25, 31) |
+      __gen_offset(values->YOffset, 20, 23) |
+      __gen_field(dw_SurfaceObjectControlState, 16, 19) |
+      __gen_field(values->SurfaceMinLOD, 4, 7) |
+      __gen_field(values->MIPCountLOD, 0, 3) |
+      0;
+
+   /* NOTE(review): the MCS fields and the planar UV-offset fields overlap
+    * in dw6 (MCSEnable at bit 0 vs. YOffsetforUVPlane at bits 0..13);
+    * presumably only one group is valid per surface — confirm with PRM. */
+   uint32_t dw6 =
+      __gen_field(values->MCSSurfacePitch, 3, 11) |
+      __gen_field(values->AppendCounterEnable, 1, 1) |
+      __gen_field(values->MCSEnable, 0, 0) |
+      __gen_field(values->XOffsetforUVPlane, 16, 29) |
+      __gen_field(values->YOffsetforUVPlane, 0, 13) |
+      0;
+
+   dw[6] =
+      __gen_combine_address(data, &dw[6], values->AppendCounterAddress, dw6);
+
+   /* dw7: clear-color bits and the U4.8 fixed-point resource min LOD. */
+   dw[7] =
+      __gen_field(values->RedClearColor, 31, 31) |
+      __gen_field(values->GreenClearColor, 30, 30) |
+      __gen_field(values->BlueClearColor, 29, 29) |
+      __gen_field(values->AlphaClearColor, 28, 28) |
+      __gen_field(values->ResourceMinLOD * (1 << 8), 0, 11) |
+      0;
+
+}
+
+#define GEN7_SAMPLER_BORDER_COLOR_STATE_length 0x00000004
+
+/* Gen7 SAMPLER_BORDER_COLOR_STATE, 4 DWords.  The first DWord carries
+ * either the DX10/GL float red channel or the four packed DX9 byte
+ * channels; the 0-suffixed floats (generated name dedup) are the DX10/GL
+ * green/blue/alpha channels for the remaining DWords. */
+struct GEN7_SAMPLER_BORDER_COLOR_STATE {
+   float                                        BorderColorRedDX100GL;
+   uint32_t                                     BorderColorAlpha;
+   uint32_t                                     BorderColorBlue;
+   uint32_t                                     BorderColorGreen;
+   uint32_t                                     BorderColorRedDX9;
+   float                                        BorderColorGreen0;
+   float                                        BorderColorBlue0;
+   float                                        BorderColorAlpha0;
+};
+
+/* Pack *values into the four DWords at dst.  DWord 0 overlays the DX10/GL
+ * float red channel with the four DX9 byte channels; DWords 1..3 hold the
+ * DX10/GL float green/blue/alpha channels. */
+static inline void
+GEN7_SAMPLER_BORDER_COLOR_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN7_SAMPLER_BORDER_COLOR_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->BorderColorRedDX100GL) |
+      __gen_field(values->BorderColorAlpha, 24, 31) |
+      __gen_field(values->BorderColorBlue, 16, 23) |
+      __gen_field(values->BorderColorGreen, 8, 15) |
+      __gen_field(values->BorderColorRedDX9, 0, 7) |
+      0;
+
+   /* BUG FIX: DWords 1..3 must pack the float members BorderColorGreen0 /
+    * BorderColorBlue0 / BorderColorAlpha0; the old code passed the
+    * uint32_t DX9 byte fields (BorderColorGreen etc.) to __gen_float,
+    * leaving the float members dead and double-packing the DX9 bytes. */
+   dw[1] =
+      __gen_float(values->BorderColorGreen0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->BorderColorBlue0) |
+      0;
+
+   dw[3] =
+      __gen_float(values->BorderColorAlpha0) |
+      0;
+
+}
+
+#define GEN7_SAMPLER_STATE_length 0x00000004
+
+/* Gen7 SAMPLER_STATE: filtering modes, LOD clamp/bias, shadow compare,
+ * border color pointer, chroma key, anisotropy and address modes.
+ * 4 DWords, packed by GEN7_SAMPLER_STATE_pack(). */
+struct GEN7_SAMPLER_STATE {
+   bool                                         SamplerDisable;
+#define     DX10OGL                                            0
+#define     DX9                                                1
+   uint32_t                                     TextureBorderColorMode;
+#define     OGL                                                1
+   uint32_t                                     LODPreClampEnable;
+   float                                        BaseMipLevel;
+#define     MIPFILTER_NONE                                     0
+#define     MIPFILTER_NEAREST                                  1
+#define     MIPFILTER_LINEAR                                   3
+   uint32_t                                     MipModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MagModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MinModeFilter;
+   float                                        TextureLODBias;
+#define     LEGACY                                             0
+#define     EWAApproximation                                   1
+   uint32_t                                     AnisotropicAlgorithm;
+   float                                        MinLOD;
+   float                                        MaxLOD;
+#define     PREFILTEROPALWAYS                                  0
+#define     PREFILTEROPNEVER                                   1
+#define     PREFILTEROPLESS                                    2
+#define     PREFILTEROPEQUAL                                   3
+#define     PREFILTEROPLEQUAL                                  4
+#define     PREFILTEROPGREATER                                 5
+#define     PREFILTEROPNOTEQUAL                                6
+#define     PREFILTEROPGEQUAL                                  7
+   uint32_t                                     ShadowFunction;
+#define     PROGRAMMED                                         0
+#define     OVERRIDE                                           1
+   uint32_t                                     CubeSurfaceControlMode;
+   uint32_t                                     BorderColorPointer;
+   bool                                         ChromaKeyEnable;
+   uint32_t                                     ChromaKeyIndex;
+#define     KEYFILTER_KILL_ON_ANY_MATCH                        0
+#define     KEYFILTER_REPLACE_BLACK                            1
+   uint32_t                                     ChromaKeyMode;
+#define     RATIO21                                            0
+#define     RATIO41                                            1
+#define     RATIO61                                            2
+#define     RATIO81                                            3
+#define     RATIO101                                           4
+#define     RATIO121                                           5
+#define     RATIO141                                           6
+#define     RATIO161                                           7
+   uint32_t                                     MaximumAnisotropy;
+   bool                                         RAddressMinFilterRoundingEnable;
+   bool                                         RAddressMagFilterRoundingEnable;
+   bool                                         VAddressMinFilterRoundingEnable;
+   bool                                         VAddressMagFilterRoundingEnable;
+   bool                                         UAddressMinFilterRoundingEnable;
+   bool                                         UAddressMagFilterRoundingEnable;
+#define     FULL                                               0
+#define     MED                                                2
+#define     LOW                                                3
+   uint32_t                                     TrilinearFilterQuality;
+   bool                                         NonnormalizedCoordinateEnable;
+   uint32_t                                     TCXAddressControlMode;
+   uint32_t                                     TCYAddressControlMode;
+   uint32_t                                     TCZAddressControlMode;
+};
+
+/* Pack *values into the four DWords at dst.  Float LOD fields are
+ * converted to fixed point by pre-scaling before __gen_field();
+ * TextureLODBias uses __gen_fixed() (signed fixed point). */
+static inline void
+GEN7_SAMPLER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN7_SAMPLER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SamplerDisable, 31, 31) |
+      __gen_field(values->TextureBorderColorMode, 29, 29) |
+      __gen_field(values->LODPreClampEnable, 28, 28) |
+      __gen_field(values->BaseMipLevel * (1 << 1), 22, 26) |
+      __gen_field(values->MipModeFilter, 20, 21) |
+      __gen_field(values->MagModeFilter, 17, 19) |
+      __gen_field(values->MinModeFilter, 14, 16) |
+      __gen_fixed(values->TextureLODBias, 1, 13, true, 8) |
+      __gen_field(values->AnisotropicAlgorithm, 0, 0) |
+      0;
+
+   /* MinLOD/MaxLOD are stored as unsigned fixed point with 8 fraction
+    * bits (hence the * (1 << 8) pre-scale). */
+   dw[1] =
+      __gen_field(values->MinLOD * (1 << 8), 20, 31) |
+      __gen_field(values->MaxLOD * (1 << 8), 8, 19) |
+      __gen_field(values->ShadowFunction, 1, 3) |
+      __gen_field(values->CubeSurfaceControlMode, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->BorderColorPointer, 5, 31) |
+      0;
+
+   /* dw3: the rounding-enable bits run R-min(13), R-mag(14), V-min(15),
+    * V-mag(16), U-min(17), U-mag(18). */
+   dw[3] =
+      __gen_field(values->ChromaKeyEnable, 25, 25) |
+      __gen_field(values->ChromaKeyIndex, 23, 24) |
+      __gen_field(values->ChromaKeyMode, 22, 22) |
+      __gen_field(values->MaximumAnisotropy, 19, 21) |
+      __gen_field(values->RAddressMinFilterRoundingEnable, 13, 13) |
+      __gen_field(values->RAddressMagFilterRoundingEnable, 14, 14) |
+      __gen_field(values->VAddressMinFilterRoundingEnable, 15, 15) |
+      __gen_field(values->VAddressMagFilterRoundingEnable, 16, 16) |
+      __gen_field(values->UAddressMinFilterRoundingEnable, 17, 17) |
+      __gen_field(values->UAddressMagFilterRoundingEnable, 18, 18) |
+      __gen_field(values->TrilinearFilterQuality, 11, 12) |
+      __gen_field(values->NonnormalizedCoordinateEnable, 10, 10) |
+      __gen_field(values->TCXAddressControlMode, 6, 8) |
+      __gen_field(values->TCYAddressControlMode, 3, 5) |
+      __gen_field(values->TCZAddressControlMode, 0, 2) |
+      0;
+
+}
+
+/* Auto-generated hardware enum tables: raw encodings used by the pack
+ * functions above and by pipeline state setup (primitive topologies, vertex
+ * component controls, depth/stencil compare functions, surface formats, and
+ * texture coordinate address modes).  The numeric gaps are intentional —
+ * they mirror the hardware's sparse encodings. */
+
+/* Enum 3D_Prim_Topo_Type */
+#define     _3DPRIM_POINTLIST                                  1
+#define     _3DPRIM_LINELIST                                   2
+#define     _3DPRIM_LINESTRIP                                  3
+#define     _3DPRIM_TRILIST                                    4
+#define     _3DPRIM_TRISTRIP                                   5
+#define     _3DPRIM_TRIFAN                                     6
+#define     _3DPRIM_QUADLIST                                   7
+#define     _3DPRIM_QUADSTRIP                                  8
+#define     _3DPRIM_LINELIST_ADJ                               9
+#define     _3DPRIM_LINESTRIP_ADJ                             10
+#define     _3DPRIM_TRILIST_ADJ                               11
+#define     _3DPRIM_TRISTRIP_ADJ                              12
+#define     _3DPRIM_TRISTRIP_REVERSE                          13
+#define     _3DPRIM_POLYGON                                   14
+#define     _3DPRIM_RECTLIST                                  15
+#define     _3DPRIM_LINELOOP                                  16
+#define     _3DPRIM_POINTLIST_BF                              17
+#define     _3DPRIM_LINESTRIP_CONT                            18
+#define     _3DPRIM_LINESTRIP_BF                              19
+#define     _3DPRIM_LINESTRIP_CONT_BF                         20
+#define     _3DPRIM_TRIFAN_NOSTIPPLE                          22
+#define     _3DPRIM_PATCHLIST_1                               32
+#define     _3DPRIM_PATCHLIST_2                               33
+#define     _3DPRIM_PATCHLIST_3                               34
+#define     _3DPRIM_PATCHLIST_4                               35
+#define     _3DPRIM_PATCHLIST_5                               36
+#define     _3DPRIM_PATCHLIST_6                               37
+#define     _3DPRIM_PATCHLIST_7                               38
+#define     _3DPRIM_PATCHLIST_8                               39
+#define     _3DPRIM_PATCHLIST_9                               40
+#define     _3DPRIM_PATCHLIST_10                              41
+#define     _3DPRIM_PATCHLIST_11                              42
+#define     _3DPRIM_PATCHLIST_12                              43
+#define     _3DPRIM_PATCHLIST_13                              44
+#define     _3DPRIM_PATCHLIST_14                              45
+#define     _3DPRIM_PATCHLIST_15                              46
+#define     _3DPRIM_PATCHLIST_16                              47
+#define     _3DPRIM_PATCHLIST_17                              48
+#define     _3DPRIM_PATCHLIST_18                              49
+#define     _3DPRIM_PATCHLIST_19                              50
+#define     _3DPRIM_PATCHLIST_20                              51
+#define     _3DPRIM_PATCHLIST_21                              52
+#define     _3DPRIM_PATCHLIST_22                              53
+#define     _3DPRIM_PATCHLIST_23                              54
+#define     _3DPRIM_PATCHLIST_24                              55
+#define     _3DPRIM_PATCHLIST_25                              56
+#define     _3DPRIM_PATCHLIST_26                              57
+#define     _3DPRIM_PATCHLIST_27                              58
+#define     _3DPRIM_PATCHLIST_28                              59
+#define     _3DPRIM_PATCHLIST_29                              60
+#define     _3DPRIM_PATCHLIST_30                              61
+#define     _3DPRIM_PATCHLIST_31                              62
+#define     _3DPRIM_PATCHLIST_32                              63
+
+/* Enum 3D_Vertex_Component_Control */
+#define     VFCOMP_NOSTORE                                     0
+#define     VFCOMP_STORE_SRC                                   1
+#define     VFCOMP_STORE_0                                     2
+#define     VFCOMP_STORE_1_FP                                  3
+#define     VFCOMP_STORE_1_INT                                 4
+#define     VFCOMP_STORE_VID                                   5
+#define     VFCOMP_STORE_IID                                   6
+#define     VFCOMP_STORE_PID                                   7
+
+/* Enum 3D_Compare_Function */
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+
+/* Enum SURFACE_FORMAT */
+#define     R32G32B32A32_FLOAT                                 0
+#define     R32G32B32A32_SINT                                  1
+#define     R32G32B32A32_UINT                                  2
+#define     R32G32B32A32_UNORM                                 3
+#define     R32G32B32A32_SNORM                                 4
+#define     R64G64_FLOAT                                       5
+#define     R32G32B32X32_FLOAT                                 6
+#define     R32G32B32A32_SSCALED                               7
+#define     R32G32B32A32_USCALED                               8
+#define     R32G32B32A32_SFIXED                               32
+#define     R64G64_PASSTHRU                                   33
+#define     R32G32B32_FLOAT                                   64
+#define     R32G32B32_SINT                                    65
+#define     R32G32B32_UINT                                    66
+#define     R32G32B32_UNORM                                   67
+#define     R32G32B32_SNORM                                   68
+#define     R32G32B32_SSCALED                                 69
+#define     R32G32B32_USCALED                                 70
+#define     R32G32B32_SFIXED                                  80
+#define     R16G16B16A16_UNORM                               128
+#define     R16G16B16A16_SNORM                               129
+#define     R16G16B16A16_SINT                                130
+#define     R16G16B16A16_UINT                                131
+#define     R16G16B16A16_FLOAT                               132
+#define     R32G32_FLOAT                                     133
+#define     R32G32_SINT                                      134
+#define     R32G32_UINT                                      135
+#define     R32_FLOAT_X8X24_TYPELESS                         136
+#define     X32_TYPELESS_G8X24_UINT                          137
+#define     L32A32_FLOAT                                     138
+#define     R32G32_UNORM                                     139
+#define     R32G32_SNORM                                     140
+#define     R64_FLOAT                                        141
+#define     R16G16B16X16_UNORM                               142
+#define     R16G16B16X16_FLOAT                               143
+#define     A32X32_FLOAT                                     144
+#define     L32X32_FLOAT                                     145
+#define     I32X32_FLOAT                                     146
+#define     R16G16B16A16_SSCALED                             147
+#define     R16G16B16A16_USCALED                             148
+#define     R32G32_SSCALED                                   149
+#define     R32G32_USCALED                                   150
+#define     R32G32_SFIXED                                    160
+#define     R64_PASSTHRU                                     161
+#define     B8G8R8A8_UNORM                                   192
+#define     B8G8R8A8_UNORM_SRGB                              193
+#define     R10G10B10A2_UNORM                                194
+#define     R10G10B10A2_UNORM_SRGB                           195
+#define     R10G10B10A2_UINT                                 196
+#define     R10G10B10_SNORM_A2_UNORM                         197
+#define     R8G8B8A8_UNORM                                   199
+#define     R8G8B8A8_UNORM_SRGB                              200
+#define     R8G8B8A8_SNORM                                   201
+#define     R8G8B8A8_SINT                                    202
+#define     R8G8B8A8_UINT                                    203
+#define     R16G16_UNORM                                     204
+#define     R16G16_SNORM                                     205
+#define     R16G16_SINT                                      206
+#define     R16G16_UINT                                      207
+#define     R16G16_FLOAT                                     208
+#define     B10G10R10A2_UNORM                                209
+#define     B10G10R10A2_UNORM_SRGB                           210
+#define     R11G11B10_FLOAT                                  211
+#define     R32_SINT                                         214
+#define     R32_UINT                                         215
+#define     R32_FLOAT                                        216
+#define     R24_UNORM_X8_TYPELESS                            217
+#define     X24_TYPELESS_G8_UINT                             218
+#define     L32_UNORM                                        221
+#define     A32_UNORM                                        222
+#define     L16A16_UNORM                                     223
+#define     I24X8_UNORM                                      224
+#define     L24X8_UNORM                                      225
+#define     A24X8_UNORM                                      226
+#define     I32_FLOAT                                        227
+#define     L32_FLOAT                                        228
+#define     A32_FLOAT                                        229
+#define     X8B8_UNORM_G8R8_SNORM                            230
+#define     A8X8_UNORM_G8R8_SNORM                            231
+#define     B8X8_UNORM_G8R8_SNORM                            232
+#define     B8G8R8X8_UNORM                                   233
+#define     B8G8R8X8_UNORM_SRGB                              234
+#define     R8G8B8X8_UNORM                                   235
+#define     R8G8B8X8_UNORM_SRGB                              236
+#define     R9G9B9E5_SHAREDEXP                               237
+#define     B10G10R10X2_UNORM                                238
+#define     L16A16_FLOAT                                     240
+#define     R32_UNORM                                        241
+#define     R32_SNORM                                        242
+#define     R10G10B10X2_USCALED                              243
+#define     R8G8B8A8_SSCALED                                 244
+#define     R8G8B8A8_USCALED                                 245
+#define     R16G16_SSCALED                                   246
+#define     R16G16_USCALED                                   247
+#define     R32_SSCALED                                      248
+#define     R32_USCALED                                      249
+#define     B5G6R5_UNORM                                     256
+#define     B5G6R5_UNORM_SRGB                                257
+#define     B5G5R5A1_UNORM                                   258
+#define     B5G5R5A1_UNORM_SRGB                              259
+#define     B4G4R4A4_UNORM                                   260
+#define     B4G4R4A4_UNORM_SRGB                              261
+#define     R8G8_UNORM                                       262
+#define     R8G8_SNORM                                       263
+#define     R8G8_SINT                                        264
+#define     R8G8_UINT                                        265
+#define     R16_UNORM                                        266
+#define     R16_SNORM                                        267
+#define     R16_SINT                                         268
+#define     R16_UINT                                         269
+#define     R16_FLOAT                                        270
+#define     A8P8_UNORM_PALETTE0                              271
+#define     A8P8_UNORM_PALETTE1                              272
+#define     I16_UNORM                                        273
+#define     L16_UNORM                                        274
+#define     A16_UNORM                                        275
+#define     L8A8_UNORM                                       276
+#define     I16_FLOAT                                        277
+#define     L16_FLOAT                                        278
+#define     A16_FLOAT                                        279
+#define     L8A8_UNORM_SRGB                                  280
+#define     R5G5_SNORM_B6_UNORM                              281
+#define     B5G5R5X1_UNORM                                   282
+#define     B5G5R5X1_UNORM_SRGB                              283
+#define     R8G8_SSCALED                                     284
+#define     R8G8_USCALED                                     285
+#define     R16_SSCALED                                      286
+#define     R16_USCALED                                      287
+#define     P8A8_UNORM_PALETTE0                              290
+#define     P8A8_UNORM_PALETTE1                              291
+#define     A1B5G5R5_UNORM                                   292
+#define     A4B4G4R4_UNORM                                   293
+#define     L8A8_UINT                                        294
+#define     L8A8_SINT                                        295
+#define     R8_UNORM                                         320
+#define     R8_SNORM                                         321
+#define     R8_SINT                                          322
+#define     R8_UINT                                          323
+#define     A8_UNORM                                         324
+#define     I8_UNORM                                         325
+#define     L8_UNORM                                         326
+#define     P4A4_UNORM_PALETTE0                              327
+#define     A4P4_UNORM_PALETTE0                              328
+#define     R8_SSCALED                                       329
+#define     R8_USCALED                                       330
+#define     P8_UNORM_PALETTE0                                331
+#define     L8_UNORM_SRGB                                    332
+#define     P8_UNORM_PALETTE1                                333
+#define     P4A4_UNORM_PALETTE1                              334
+#define     A4P4_UNORM_PALETTE1                              335
+#define     Y8_UNORM                                         336
+#define     L8_UINT                                          338
+#define     L8_SINT                                          339
+#define     I8_UINT                                          340
+#define     I8_SINT                                          341
+#define     DXT1_RGB_SRGB                                    384
+#define     R1_UNORM                                         385
+#define     YCRCB_NORMAL                                     386
+#define     YCRCB_SWAPUVY                                    387
+#define     P2_UNORM_PALETTE0                                388
+#define     P2_UNORM_PALETTE1                                389
+#define     BC1_UNORM                                        390
+#define     BC2_UNORM                                        391
+#define     BC3_UNORM                                        392
+#define     BC4_UNORM                                        393
+#define     BC5_UNORM                                        394
+#define     BC1_UNORM_SRGB                                   395
+#define     BC2_UNORM_SRGB                                   396
+#define     BC3_UNORM_SRGB                                   397
+#define     MONO8                                            398
+#define     YCRCB_SWAPUV                                     399
+#define     YCRCB_SWAPY                                      400
+#define     DXT1_RGB                                         401
+#define     FXT1                                             402
+#define     R8G8B8_UNORM                                     403
+#define     R8G8B8_SNORM                                     404
+#define     R8G8B8_SSCALED                                   405
+#define     R8G8B8_USCALED                                   406
+#define     R64G64B64A64_FLOAT                               407
+#define     R64G64B64_FLOAT                                  408
+#define     BC4_SNORM                                        409
+#define     BC5_SNORM                                        410
+#define     R16G16B16_FLOAT                                  411
+#define     R16G16B16_UNORM                                  412
+#define     R16G16B16_SNORM                                  413
+#define     R16G16B16_SSCALED                                414
+#define     R16G16B16_USCALED                                415
+#define     BC6H_SF16                                        417
+#define     BC7_UNORM                                        418
+#define     BC7_UNORM_SRGB                                   419
+#define     BC6H_UF16                                        420
+#define     PLANAR_420_8                                     421
+#define     R8G8B8_UNORM_SRGB                                424
+#define     ETC1_RGB8                                        425
+#define     ETC2_RGB8                                        426
+#define     EAC_R11                                          427
+#define     EAC_RG11                                         428
+#define     EAC_SIGNED_R11                                   429
+#define     EAC_SIGNED_RG11                                  430
+#define     ETC2_SRGB8                                       431
+#define     R16G16B16_UINT                                   432
+#define     R16G16B16_SINT                                   433
+#define     R32_SFIXED                                       434
+#define     R10G10B10A2_SNORM                                435
+#define     R10G10B10A2_USCALED                              436
+#define     R10G10B10A2_SSCALED                              437
+#define     R10G10B10A2_SINT                                 438
+#define     B10G10R10A2_SNORM                                439
+#define     B10G10R10A2_USCALED                              440
+#define     B10G10R10A2_SSCALED                              441
+#define     B10G10R10A2_UINT                                 442
+#define     B10G10R10A2_SINT                                 443
+#define     R64G64B64A64_PASSTHRU                            444
+#define     R64G64B64_PASSTHRU                               445
+#define     ETC2_RGB8_PTA                                    448
+#define     ETC2_SRGB8_PTA                                   449
+#define     ETC2_EAC_RGBA8                                   450
+#define     ETC2_EAC_SRGB8_A8                                451
+#define     R8G8B8_UINT                                      456
+#define     R8G8B8_SINT                                      457
+#define     RAW                                              511
+
+/* Enum Texture Coordinate Mode */
+#define     TCM_WRAP                                           0
+#define     TCM_MIRROR                                         1
+#define     TCM_CLAMP                                          2
+#define     TCM_CUBE                                           3
+#define     TCM_CLAMP_BORDER                                   4
+#define     TCM_MIRROR_ONCE                                    5
+
diff --git a/src/vulkan/gen7_pipeline.c b/src/vulkan/gen7_pipeline.c
new file mode 100644 (file)
index 0000000..9b90c6e
--- /dev/null
@@ -0,0 +1,532 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen7_pack.h"
+#include "gen75_pack.h"
+
+#include "genX_pipeline_util.h"
+
+/* Build and emit 3DSTATE_VERTEX_ELEMENTS for this pipeline.
+ *
+ * The bitmask of live vertex elements comes from one of two places: for
+ * meta pipelines with the VS disabled it is taken directly from the
+ * attribute descriptions, otherwise from the generic inputs the compiled
+ * VS actually reads (vs_prog_data.inputs_read).  When the VS uses
+ * gl_VertexID or gl_InstanceID, one extra trailing element is appended
+ * to supply them (VFCOMP_STORE_VID / VFCOMP_STORE_IID).
+ */
+static void
+emit_vertex_input(struct anv_pipeline *pipeline,
+                  const VkPipelineVertexInputStateCreateInfo *info,
+                  const struct anv_graphics_pipeline_create_info *extra)
+{
+   uint32_t elements;
+   if (extra && extra->disable_vs) {
+      /* If the VS is disabled, just assume the user knows what they're
+       * doing and apply the layout blindly.  This can only come from
+       * meta, so this *should* be safe.
+       */
+      elements = 0;
+      for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++)
+         elements |= (1 << info->pVertexAttributeDescriptions[i].location);
+   } else {
+      /* Pull inputs_read out of the VS prog data */
+      uint64_t inputs_read = pipeline->vs_prog_data.inputs_read;
+      /* Only generic attributes may be read; shift them down to bit 0. */
+      assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
+      elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+   }
+
+   /* Despite the name, this counts VERTEX_ELEMENT_STATE entries to emit,
+    * one per live attribute. */
+   uint32_t vb_count = __builtin_popcount(elements);
+
+   if (pipeline->vs_prog_data.uses_vertexid ||
+       pipeline->vs_prog_data.uses_instanceid)
+      vb_count++;
+
+   if (vb_count == 0)
+      return;
+
+   /* 1 header dword + 2 dwords per VERTEX_ELEMENT_STATE. */
+   const uint32_t num_dwords = 1 + vb_count * 2;
+
+   uint32_t *p = anv_batch_emitn(&pipeline->batch, num_dwords,
+                                 GEN7_3DSTATE_VERTEX_ELEMENTS);
+   /* Zero the payload so any slot we skip below stays !Valid. */
+   memset(p + 1, 0, (num_dwords - 1) * 4);
+
+   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
+      const VkVertexInputAttributeDescription *desc =
+         &info->pVertexAttributeDescriptions[i];
+      enum isl_format format = anv_get_isl_format(desc->format,
+                                                  VK_IMAGE_ASPECT_COLOR_BIT,
+                                                  VK_IMAGE_TILING_LINEAR);
+
+      assert(desc->binding < 32);
+
+      if ((elements & (1 << desc->location)) == 0)
+         continue; /* Binding unused */
+
+      /* Dense slot index: number of live elements at lower locations. */
+      uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1));
+
+      struct GEN7_VERTEX_ELEMENT_STATE element = {
+         .VertexBufferIndex = desc->binding,
+         .Valid = true,
+         .SourceElementFormat = format,
+         .EdgeFlagEnable = false,
+         .SourceElementOffset = desc->offset,
+         .Component0Control = vertex_element_comp_control(format, 0),
+         .Component1Control = vertex_element_comp_control(format, 1),
+         .Component2Control = vertex_element_comp_control(format, 2),
+         .Component3Control = vertex_element_comp_control(format, 3),
+      };
+      GEN7_VERTEX_ELEMENT_STATE_pack(NULL, &p[1 + slot * 2], &element);
+   }
+
+   if (pipeline->vs_prog_data.uses_vertexid ||
+       pipeline->vs_prog_data.uses_instanceid) {
+      /* Extra system-value element in the last slot: z = VertexID,
+       * w = InstanceID; x/y are stored as zero. */
+      struct GEN7_VERTEX_ELEMENT_STATE element = {
+         .Valid = true,
+         /* FIXME: Do we need to provide the base vertex as component 0 here
+          * to support the correct base vertex ID? */
+         .Component0Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_0,
+         .Component2Control = VFCOMP_STORE_VID,
+         .Component3Control = VFCOMP_STORE_IID
+      };
+      GEN7_VERTEX_ELEMENT_STATE_pack(NULL, &p[1 + (vb_count - 1) * 2], &element);
+   }
+}
+
+/* Pack the 3DSTATE_SF template from the pipeline's rasterization state.
+ *
+ * The packed dwords are stashed in pipeline->gen7.sf rather than emitted
+ * into the batch here; the fields left as commented-out reminders are not
+ * programmed by this function.
+ */
+static void
+gen7_emit_rs_state(struct anv_pipeline *pipeline,
+                   const VkPipelineRasterizationStateCreateInfo *info,
+                   const struct anv_graphics_pipeline_create_info *extra)
+{
+   struct GEN7_3DSTATE_SF sf = {
+      GEN7_3DSTATE_SF_header,
+
+      /* FIXME: Get this from pass info */
+      .DepthBufferSurfaceFormat                 = D24_UNORM_X8_UINT,
+
+      /* LegacyGlobalDepthBiasEnable */
+
+      .StatisticsEnable                         = true,
+      /* Vulkan has a single polygonMode; apply it to both faces. */
+      .FrontFaceFillMode                        = vk_to_gen_fillmode[info->polygonMode],
+      .BackFaceFillMode                         = vk_to_gen_fillmode[info->polygonMode],
+      .ViewTransformEnable                      = !(extra && extra->disable_viewport),
+      .FrontWinding                             = vk_to_gen_front_face[info->frontFace],
+      /* bool                                         AntiAliasingEnable; */
+
+      .CullMode                                 = vk_to_gen_cullmode[info->cullMode],
+
+      /* uint32_t                                     LineEndCapAntialiasingRegionWidth; */
+      .ScissorRectangleEnable                   =  !(extra && extra->disable_scissor),
+
+      /* uint32_t                                     MultisampleRasterizationMode; */
+      /* bool                                         LastPixelEnable; */
+
+      .TriangleStripListProvokingVertexSelect   = 0,
+      .LineStripListProvokingVertexSelect       = 0,
+      .TriangleFanProvokingVertexSelect         = 0,
+
+      /* uint32_t                                     AALineDistanceMode; */
+      /* uint32_t                                     VertexSubPixelPrecisionSelect; */
+      /* Use the fixed 1.0 point width below unless the shader writes its
+       * own point size. */
+      .UsePointWidthState                       = !pipeline->writes_point_size,
+      .PointWidth                               = 1.0,
+   };
+
+   GEN7_3DSTATE_SF_pack(NULL, &pipeline->gen7.sf, &sf);
+}
+
+/* Pack DEPTH_STENCIL_STATE into pipeline->gen7.depth_stencil_state.
+ *
+ * When no depth/stencil info is provided the template is zeroed instead,
+ * because the command buffer later ORs dynamic state into these dwords and
+ * needs a well-defined starting value.
+ */
+static void
+gen7_emit_ds_state(struct anv_pipeline *pipeline,
+                   const VkPipelineDepthStencilStateCreateInfo *info)
+{
+   if (info == NULL) {
+      /* We're going to OR this together with the dynamic state.  We need
+       * to make sure it's initialized to something useful.
+       */
+      memset(pipeline->gen7.depth_stencil_state, 0,
+             sizeof(pipeline->gen7.depth_stencil_state));
+      return;
+   }
+
+   struct GEN7_DEPTH_STENCIL_STATE state = {
+      .DepthTestEnable = info->depthTestEnable,
+      .DepthBufferWriteEnable = info->depthWriteEnable,
+      .DepthTestFunction = vk_to_gen_compare_op[info->depthCompareOp],
+      /* Always two-sided: back-face ops are programmed separately below. */
+      .DoubleSidedStencilEnable = true,
+
+      /* Front-face stencil ops/compare come from info->front... */
+      .StencilTestEnable = info->stencilTestEnable,
+      .StencilFailOp = vk_to_gen_stencil_op[info->front.failOp],
+      .StencilPassDepthPassOp = vk_to_gen_stencil_op[info->front.passOp],
+      .StencilPassDepthFailOp = vk_to_gen_stencil_op[info->front.depthFailOp],
+      .StencilTestFunction = vk_to_gen_compare_op[info->front.compareOp],
+
+      /* ...and back-face ops/compare from info->back. */
+      .BackfaceStencilFailOp = vk_to_gen_stencil_op[info->back.failOp],
+      .BackfaceStencilPassDepthPassOp = vk_to_gen_stencil_op[info->back.passOp],
+      .BackfaceStencilPassDepthFailOp = vk_to_gen_stencil_op[info->back.depthFailOp],
+      .BackFaceStencilTestFunction = vk_to_gen_compare_op[info->back.compareOp],
+   };
+
+   GEN7_DEPTH_STENCIL_STATE_pack(NULL, &pipeline->gen7.depth_stencil_state, &state);
+}
+
+/* Emit BLEND_STATE into the dynamic state pool and point the hardware at
+ * it with 3DSTATE_BLEND_STATE_POINTERS.
+ *
+ * NOTE(review): `info` is dereferenced unconditionally, but the Vulkan
+ * spec allows pCreateInfo->pColorBlendState to be NULL for pipelines
+ * without color attachments — confirm that all callers guarantee a
+ * non-NULL `info` here.
+ */
+static void
+gen7_emit_cb_state(struct anv_pipeline *pipeline,
+                   const VkPipelineColorBlendStateCreateInfo *info,
+                   const VkPipelineMultisampleStateCreateInfo *ms_info)
+{
+   struct anv_device *device = pipeline->device;
+
+   if (info->pAttachments == NULL) {
+      /* No per-attachment state: blending off, all channel writes enabled
+       * (WriteDisable* = false means the channel IS written). */
+      pipeline->blend_state =
+         anv_state_pool_emit(&device->dynamic_state_pool,
+            GEN7_BLEND_STATE, 64,
+            .ColorBufferBlendEnable = false,
+            .WriteDisableAlpha = false,
+            .WriteDisableRed = false,
+            .WriteDisableGreen = false,
+            .WriteDisableBlue = false);
+   } else {
+      /* FIXME-GEN7: All render targets share blend state settings on gen7, we
+       * can't implement this.
+       */
+      const VkPipelineColorBlendAttachmentState *a = &info->pAttachments[0];
+      pipeline->blend_state =
+         anv_state_pool_emit(&device->dynamic_state_pool,
+            GEN7_BLEND_STATE, 64,
+
+            .ColorBufferBlendEnable = a->blendEnable,
+            .IndependentAlphaBlendEnable = true, /* FIXME: yes? */
+            .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
+
+            .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
+            .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
+
+            .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
+            .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
+            .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
+            .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
+
+#     if 0
+            bool                                AlphaToOneEnable;
+            bool                                AlphaToCoverageDitherEnable;
+#     endif
+
+            /* Hardware write-disables are the inverse of the Vulkan
+             * colorWriteMask bits. */
+            .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
+            .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
+            .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
+            .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
+
+            .LogicOpEnable = info->logicOpEnable,
+            .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
+
+#     if 0
+            bool                                AlphaTestEnable;
+            uint32_t                            AlphaTestFunction;
+            bool                                ColorDitherEnable;
+            uint32_t                            XDitherOffset;
+            uint32_t                            YDitherOffset;
+            uint32_t                            ColorClampRange;
+            bool                                PreBlendColorClampEnable;
+            bool                                PostBlendColorClampEnable;
+#     endif
+            );
+    }
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_BLEND_STATE_POINTERS,
+                  .BlendStatePointer = pipeline->blend_state.offset);
+}
+
+/* Encode a stage's per-thread scratch requirement for the
+ * PerThreadScratchSpace field of 3DSTATE_VS/GS/PS.
+ *
+ * NOTE(review): this yields ffs(total_scratch / 1024), i.e. log2(kB) + 1
+ * for power-of-two sizes.  The hardware field is a power-of-two encoding in
+ * 1kB granularity; confirm against the gen7 PRM whether the intended value
+ * is ffs(total_scratch) - 11 (which differs from this by one).
+ */
+static inline uint32_t
+scratch_space(const struct brw_stage_prog_data *prog_data)
+{
+   return ffs(prog_data->total_scratch / 1024);
+}
+
+/* Create a gen7/gen75 graphics pipeline: allocate the anv_pipeline object,
+ * run the common anv_pipeline_init (shader compilation, batch setup), then
+ * bake all the fixed-function 3DSTATE_* packets for this pipeline into
+ * pipeline->batch.  Tessellation and streamout are explicitly disabled;
+ * multisampling is not wired up yet (single-sample is hardcoded below).
+ */
+GENX_FUNC(GEN7, GEN75) VkResult
+genX(graphics_pipeline_create)(
+    VkDevice                                    _device,
+    struct anv_pipeline_cache *                 cache,
+    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
+    const struct anv_graphics_pipeline_create_info *extra,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipeline)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_pipeline *pipeline;
+   VkResult result;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
+   
+   pipeline = anv_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pipeline == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_pipeline_init(pipeline, device, cache,
+                              pCreateInfo, extra, pAllocator);
+   if (result != VK_SUCCESS) {
+      /* anv_pipeline_init failed; free the object and propagate its error. */
+      anv_free2(&device->alloc, pAllocator, pipeline);
+      return result;
+   }
+
+   assert(pCreateInfo->pVertexInputState);
+   emit_vertex_input(pipeline, pCreateInfo->pVertexInputState, extra);
+
+   assert(pCreateInfo->pRasterizationState);
+   gen7_emit_rs_state(pipeline, pCreateInfo->pRasterizationState, extra);
+
+   gen7_emit_ds_state(pipeline, pCreateInfo->pDepthStencilState);
+
+   gen7_emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
+                                pCreateInfo->pMultisampleState);
+
+   /* Unused stages are explicitly disabled up front. */
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_VF_STATISTICS,
+                   .StatisticsEnable = true);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_HS, .Enable = false);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_TE, .TEEnable = false);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_DS, .DSFunctionEnable = false);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_STREAMOUT, .SOFunctionEnable = false);
+
+   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
+    *
+    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
+    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
+    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
+    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
+    *    needs to be sent before any combination of VS associated 3DSTATE."
+    */
+   anv_batch_emit(&pipeline->batch, GEN7_PIPE_CONTROL,
+                  .DepthStallEnable = true,
+                  .PostSyncOperation = WriteImmediateData,
+                  .Address = { &device->workaround_bo, 0 });
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS,
+                  .ConstantBufferOffset = 0,
+                  .ConstantBufferSize = 4);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_GS,
+                  .ConstantBufferOffset = 4,
+                  .ConstantBufferSize = 4);
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS,
+                  .ConstantBufferOffset = 8,
+                  .ConstantBufferSize = 4);
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_AA_LINE_PARAMETERS);
+
+   const VkPipelineRasterizationStateCreateInfo *rs_info =
+      pCreateInfo->pRasterizationState;
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_CLIP,
+      .FrontWinding                             = vk_to_gen_front_face[rs_info->frontFace],
+      .CullMode                                 = vk_to_gen_cullmode[rs_info->cullMode],
+      .ClipEnable                               = true,
+      .APIMode                                  = APIMODE_OGL,
+      .ViewportXYClipTestEnable                 = !(extra && extra->disable_viewport),
+      .ClipMode                                 = CLIPMODE_NORMAL,
+      .TriangleStripListProvokingVertexSelect   = 0,
+      .LineStripListProvokingVertexSelect       = 0,
+      .TriangleFanProvokingVertexSelect         = 0,
+      .MinimumPointWidth                        = 0.125,
+      .MaximumPointWidth                        = 255.875);
+
+   /* Multisampling is not consumed here yet: force single-sample state
+    * regardless of pCreateInfo->pMultisampleState.
+    */
+   uint32_t samples = 1;
+   uint32_t log2_samples = __builtin_ffs(samples) - 1;
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_MULTISAMPLE,
+      .PixelLocation                            = PIXLOC_CENTER,
+      .NumberofMultisamples                     = log2_samples);
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_SAMPLE_MASK,
+      .SampleMask                               = 0xff);
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_URB_VS,
+      .VSURBStartingAddress                     = pipeline->urb.vs_start,
+      .VSURBEntryAllocationSize                 = pipeline->urb.vs_size - 1,
+      .VSNumberofURBEntries                     = pipeline->urb.nr_vs_entries);
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_URB_GS,
+      .GSURBStartingAddress                     = pipeline->urb.gs_start,
+      .GSURBEntryAllocationSize                 = pipeline->urb.gs_size - 1,
+      .GSNumberofURBEntries                     = pipeline->urb.nr_gs_entries);
+
+   /* HS/DS get zero URB entries since tessellation is disabled. */
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_URB_HS,
+      .HSURBStartingAddress                     = pipeline->urb.vs_start,
+      .HSURBEntryAllocationSize                 = 0,
+      .HSNumberofURBEntries                     = 0);
+
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_URB_DS,
+      .DSURBStartingAddress                     = pipeline->urb.vs_start,
+      .DSURBEntryAllocationSize                 = 0,
+      .DSNumberofURBEntries                     = 0);
+
+   const struct brw_vue_prog_data *vue_prog_data = &pipeline->vs_prog_data.base;
+   /* The last geometry producing stage will set urb_offset and urb_length,
+    * which we use in 3DSTATE_SBE. Skip the VUE header and position slots. */
+   uint32_t urb_offset = 1;
+   uint32_t urb_length = (vue_prog_data->vue_map.num_slots + 1) / 2 - urb_offset;
+
+#if 0 
+   /* From gen7_vs_state.c */
+
+   /**
+    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
+    * Geometry > Geometry Shader > State:
+    *
+    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
+    *     whole fixed function pipeline when the GS enable changes value in
+    *     the 3DSTATE_GS."
+    *
+    * The hardware architects have clarified that in this context "flush the
+    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
+    * Stall" bit set.
+    */
+   if (!brw->is_haswell && !brw->is_baytrail)
+      gen7_emit_vs_workaround_flush(brw);
+#endif
+
+   if (pipeline->vs_vec4 == NO_KERNEL || (extra && extra->disable_vs))
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS), .VSFunctionEnable = false);
+   else
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS),
+         .KernelStartPointer                    = pipeline->vs_vec4,
+         .ScratchSpaceBaseOffset                = pipeline->scratch_start[MESA_SHADER_VERTEX],
+         .PerThreadScratchSpace                 = scratch_space(&vue_prog_data->base),
+
+         .DispatchGRFStartRegisterforURBData    =
+            vue_prog_data->base.dispatch_grf_start_reg,
+         .VertexURBEntryReadLength              = vue_prog_data->urb_read_length,
+         .VertexURBEntryReadOffset              = 0,
+
+         .MaximumNumberofThreads                = device->info.max_vs_threads - 1,
+         .StatisticsEnable                      = true,
+         .VSFunctionEnable                      = true);
+
+   const struct brw_gs_prog_data *gs_prog_data = &pipeline->gs_prog_data;
+
+   /* NOTE(review): this tests extra->disable_vs, not a GS-specific flag.
+    * It may be intentional (disabling the VS path disables the whole
+    * geometry pipeline for meta operations), but it reads like a copy-paste
+    * from the VS branch above — confirm.
+    */
+   if (pipeline->gs_kernel == NO_KERNEL || (extra && extra->disable_vs)) {
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), .GSEnable = false);
+   } else {
+      urb_offset = 1;
+      urb_length = (gs_prog_data->base.vue_map.num_slots + 1) / 2 - urb_offset;
+
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS),
+         .KernelStartPointer                    = pipeline->gs_kernel,
+         .ScratchSpaceBasePointer               = pipeline->scratch_start[MESA_SHADER_GEOMETRY],
+         .PerThreadScratchSpace                 = scratch_space(&gs_prog_data->base.base),
+
+         .OutputVertexSize                      = gs_prog_data->output_vertex_size_hwords * 2 - 1,
+         .OutputTopology                        = gs_prog_data->output_topology,
+         .VertexURBEntryReadLength              = gs_prog_data->base.urb_read_length,
+         .IncludeVertexHandles                  = gs_prog_data->base.include_vue_handles,
+         .DispatchGRFStartRegisterforURBData    =
+            gs_prog_data->base.base.dispatch_grf_start_reg,
+
+         .MaximumNumberofThreads                = device->info.max_gs_threads - 1,
+         /* This in the next dword on HSW. */
+         .ControlDataFormat                     = gs_prog_data->control_data_format,
+         .ControlDataHeaderSize                 = gs_prog_data->control_data_header_size_hwords,
+         .InstanceControl                       = MAX2(gs_prog_data->invocations, 1) - 1,
+         .DispatchMode                          = gs_prog_data->base.dispatch_mode,
+         .GSStatisticsEnable                    = true,
+         .IncludePrimitiveID                    = gs_prog_data->include_primitive_id,
+#     if (ANV_IS_HASWELL)
+         .ReorderMode                           = REORDER_TRAILING,
+#     else
+         .ReorderEnable                         = true,
+#     endif
+         .GSEnable                              = true);
+   }
+
+   const struct brw_wm_prog_data *wm_prog_data = &pipeline->wm_prog_data;
+   if (wm_prog_data->urb_setup[VARYING_SLOT_BFC0] != -1 ||
+       wm_prog_data->urb_setup[VARYING_SLOT_BFC1] != -1)
+      anv_finishme("two-sided color needs sbe swizzling setup");
+   if (wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] != -1)
+      anv_finishme("primitive_id needs sbe swizzling setup");
+
+   /* FIXME: generated header doesn't emit attr swizzle fields */
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_SBE,
+      .NumberofSFOutputAttributes               = pipeline->wm_prog_data.num_varying_inputs,
+      .VertexURBEntryReadLength                 = urb_length,
+      .VertexURBEntryReadOffset                 = urb_offset,
+      .PointSpriteTextureCoordinateOrigin       = UPPERLEFT);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS),
+      .KernelStartPointer0                      = pipeline->ps_ksp0,
+      .ScratchSpaceBasePointer                  = pipeline->scratch_start[MESA_SHADER_FRAGMENT],
+      .PerThreadScratchSpace                    = scratch_space(&wm_prog_data->base),
+                  
+      .MaximumNumberofThreads                   = device->info.max_wm_threads - 1,
+      .PushConstantEnable                       = wm_prog_data->base.nr_params > 0,
+      .AttributeEnable                          = wm_prog_data->num_varying_inputs > 0,
+      .oMaskPresenttoRenderTarget               = wm_prog_data->uses_omask,
+
+      .RenderTargetFastClearEnable              = false,
+      .DualSourceBlendEnable                    = false,
+      .RenderTargetResolveEnable                = false,
+
+      .PositionXYOffsetSelect                   = wm_prog_data->uses_pos_offset ?
+         POSOFFSET_SAMPLE : POSOFFSET_NONE,
+
+      ._32PixelDispatchEnable                   = false,
+      ._16PixelDispatchEnable                   = pipeline->ps_simd16 != NO_KERNEL,
+      ._8PixelDispatchEnable                    = pipeline->ps_simd8 != NO_KERNEL,
+
+      .DispatchGRFStartRegisterforConstantSetupData0 = pipeline->ps_grf_start0,
+      .DispatchGRFStartRegisterforConstantSetupData1 = 0,
+      .DispatchGRFStartRegisterforConstantSetupData2 = pipeline->ps_grf_start2,
+
+#if 0
+   /* Haswell requires the sample mask to be set in this packet as well as
+    * in 3DSTATE_SAMPLE_MASK; the values should match. */
+   /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
+#endif
+
+      .KernelStartPointer1                      = 0,
+      .KernelStartPointer2                      = pipeline->ps_ksp2);
+
+   /* FIXME-GEN7: This needs a lot more work, cf gen7 upload_wm_state(). */
+   anv_batch_emit(&pipeline->batch, GEN7_3DSTATE_WM,
+      .StatisticsEnable                         = true,
+      .ThreadDispatchEnable                     = true,
+      .LineEndCapAntialiasingRegionWidth        = 0, /* 0.5 pixels */
+      .LineAntialiasingRegionWidth              = 1, /* 1.0 pixels */
+      .EarlyDepthStencilControl                 = EDSC_NORMAL,
+      .PointRasterizationRule                   = RASTRULE_UPPER_RIGHT,
+      .PixelShaderComputedDepthMode             = wm_prog_data->computed_depth_mode,
+      .BarycentricInterpolationMode             = wm_prog_data->barycentric_interp_modes);
+
+   *pPipeline = anv_pipeline_to_handle(pipeline);
+
+   return VK_SUCCESS;
+}
+
+/* Stub: compute pipelines are not implemented for gen7/gen75 yet.
+ * Aborts unconditionally after logging a finishme.
+ */
+GENX_FUNC(GEN7, GEN75) VkResult
+genX(compute_pipeline_create)(
+    VkDevice                                    _device,
+    struct anv_pipeline_cache *                 cache,
+    const VkComputePipelineCreateInfo*          pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipeline)
+{
+   /* The previous message here ("primitive_id needs sbe swizzling setup")
+    * was copy-pasted from the graphics path and was misleading.
+    */
+   anv_finishme("gen7 compute pipelines not implemented");
+   abort();
+}
diff --git a/src/vulkan/gen7_state.c b/src/vulkan/gen7_state.c
new file mode 100644 (file)
index 0000000..0c830d5
--- /dev/null
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen7_pack.h"
+#include "gen75_pack.h"
+
+#include "genX_state_util.h"
+
+GENX_FUNC(GEN7, GEN75) void
+genX(fill_buffer_surface_state)(void *state, enum isl_format format,
+                                uint32_t offset, uint32_t range,
+                                uint32_t stride)
+{
+   uint32_t num_elements = range / stride;
+
+   struct GENX(RENDER_SURFACE_STATE) surface_state = {
+      .SurfaceType                              = SURFTYPE_BUFFER,
+      .SurfaceFormat                            = format,
+      .SurfaceVerticalAlignment                 = VALIGN_4,
+      .SurfaceHorizontalAlignment               = HALIGN_4,
+      .TiledSurface                             = false,
+      .RenderCacheReadWriteMode                 = false,
+      .SurfaceObjectControlState                = GENX(MOCS),
+      .Height                                   = (num_elements >> 7) & 0x3fff,
+      .Width                                    = num_elements & 0x7f,
+      .Depth                                    = (num_elements >> 21) & 0x3f,
+      .SurfacePitch                             = stride - 1,
+#  if (ANV_IS_HASWELL)
+      .ShaderChannelSelectR                     = SCS_RED,
+      .ShaderChannelSelectG                     = SCS_GREEN,
+      .ShaderChannelSelectB                     = SCS_BLUE,
+      .ShaderChannelSelectA                     = SCS_ALPHA,
+#  endif
+      .SurfaceBaseAddress                       = { NULL, offset },
+   };
+
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &surface_state);
+}
+
+/* Allocate surface state storage: from the command buffer's own surface
+ * state stream when one is given, otherwise from the device-wide pool.
+ */
+static struct anv_state
+alloc_surface_state(struct anv_device *device,
+                    struct anv_cmd_buffer *cmd_buffer)
+{
+   if (cmd_buffer == NULL)
+      return anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+
+   return anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+}
+
+/* vkCreateSampler for gen7/gen75: translate VkSamplerCreateInfo into a
+ * packed SAMPLER_STATE stored in the sampler object.
+ */
+VkResult genX(CreateSampler)(
+    VkDevice                                    _device,
+    const VkSamplerCreateInfo*                  pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSampler*                                  pSampler)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_sampler *sampler;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
+
+   sampler = anv_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
+                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!sampler)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Bug fix: the magnification and minification filters must be derived
+    * from their respective create-info fields.  Previously a single value
+    * computed from magFilter was used for both, silently ignoring
+    * pCreateInfo->minFilter.
+    */
+   uint32_t mag_filter = vk_to_gen_tex_filter(pCreateInfo->magFilter,
+                                              pCreateInfo->anisotropyEnable);
+   uint32_t min_filter = vk_to_gen_tex_filter(pCreateInfo->minFilter,
+                                              pCreateInfo->anisotropyEnable);
+
+   struct GEN7_SAMPLER_STATE sampler_state = {
+      .SamplerDisable = false,
+      .TextureBorderColorMode = DX10OGL,
+      .LODPreClampEnable = OGL,
+      .BaseMipLevel = 0.0,
+      .MipModeFilter = vk_to_gen_mipmap_mode[pCreateInfo->mipmapMode],
+      .MagModeFilter = mag_filter,
+      .MinModeFilter = min_filter,
+      /* Hardware LOD bias is in fixed point; 256 units per LOD step. */
+      .TextureLODBias = pCreateInfo->mipLodBias * 256,
+      .AnisotropicAlgorithm = EWAApproximation,
+      .MinLOD = pCreateInfo->minLod,
+      .MaxLOD = pCreateInfo->maxLod,
+      .ChromaKeyEnable = 0,
+      .ChromaKeyIndex = 0,
+      .ChromaKeyMode = 0,
+      .ShadowFunction = vk_to_gen_compare_op[pCreateInfo->compareOp],
+      .CubeSurfaceControlMode = 0,
+
+      /* Border color entries are preallocated in device->border_colors,
+       * indexed by the VkBorderColor enum, 4 floats each.
+       */
+      .BorderColorPointer =
+         device->border_colors.offset +
+         pCreateInfo->borderColor * sizeof(float) * 4,
+
+      .MaximumAnisotropy = vk_to_gen_max_anisotropy(pCreateInfo->maxAnisotropy),
+      .RAddressMinFilterRoundingEnable = 0,
+      .RAddressMagFilterRoundingEnable = 0,
+      .VAddressMinFilterRoundingEnable = 0,
+      .VAddressMagFilterRoundingEnable = 0,
+      .UAddressMinFilterRoundingEnable = 0,
+      .UAddressMagFilterRoundingEnable = 0,
+      .TrilinearFilterQuality = 0,
+      .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
+      .TCXAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeU],
+      .TCYAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeV],
+      .TCZAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeW],
+   };
+
+   GEN7_SAMPLER_STATE_pack(NULL, sampler->state, &sampler_state);
+
+   *pSampler = anv_sampler_to_handle(sampler);
+
+   return VK_SUCCESS;
+}
+
+/* Map a surface horizontal alignment (in surface elements) to the HALIGN
+ * enum used in RENDER_SURFACE_STATE.  Unlisted indices are zero.
+ */
+static const uint8_t anv_halign[] = {
+    [4] = HALIGN_4,
+    [8] = HALIGN_8,
+};
+
+/* Same mapping for vertical alignment (VALIGN enum). */
+static const uint8_t anv_valign[] = {
+    [2] = VALIGN_2,
+    [4] = VALIGN_4,
+};
+
+GENX_FUNC(GEN7, GEN75) void
+genX(image_view_init)(struct anv_image_view *iview,
+                      struct anv_device *device,
+                      const VkImageViewCreateInfo* pCreateInfo,
+                      struct anv_cmd_buffer *cmd_buffer)
+{
+   ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+
+   const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
+
+   struct anv_surface *surface =
+      anv_image_get_surface_for_aspect_mask(image, range->aspectMask);
+
+   if (pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_2D)
+      anv_finishme("non-2D image views");
+
+   uint32_t depth = 1;
+   if (range->layerCount > 1) {
+      depth = range->layerCount;
+   } else if (image->extent.depth > 1) {
+      depth = image->extent.depth;
+   }
+
+   const struct isl_extent3d image_align_sa =
+      isl_surf_get_image_alignment_sa(&surface->isl);
+
+   const struct GENX(RENDER_SURFACE_STATE) template = {
+      .SurfaceType = anv_surftype(image, pCreateInfo->viewType, false),
+      .SurfaceArray = image->array_size > 1,
+      .SurfaceFormat = iview->format,
+      .SurfaceVerticalAlignment = anv_valign[image_align_sa.height],
+      .SurfaceHorizontalAlignment = anv_halign[image_align_sa.width],
+
+      /* From bspec (DevSNB, DevIVB): "Set Tile Walk to TILEWALK_XMAJOR if
+       * Tiled Surface is False."
+       */
+      .TiledSurface = surface->isl.tiling != ISL_TILING_LINEAR,
+      .TileWalk = surface->isl.tiling == ISL_TILING_Y0 ?
+                  TILEWALK_YMAJOR : TILEWALK_XMAJOR,
+
+      .VerticalLineStride = 0,
+      .VerticalLineStrideOffset = 0,
+
+      .RenderCacheReadWriteMode = 0, /* TEMPLATE */
+
+      .Height = image->extent.height - 1,
+      .Width = image->extent.width - 1,
+      .Depth = depth - 1,
+      .SurfacePitch = surface->isl.row_pitch - 1,
+      .MinimumArrayElement = range->baseArrayLayer,
+      .NumberofMultisamples = MULTISAMPLECOUNT_1,
+      .XOffset = 0,
+      .YOffset = 0,
+
+      .SurfaceObjectControlState = GENX(MOCS),
+
+      .MIPCountLOD = 0, /* TEMPLATE */
+      .SurfaceMinLOD = 0, /* TEMPLATE */
+
+      .MCSEnable = false,
+#  if (ANV_IS_HASWELL)
+      .ShaderChannelSelectR = vk_to_gen_swizzle(pCreateInfo->components.r,
+                                                VK_COMPONENT_SWIZZLE_R),
+      .ShaderChannelSelectG = vk_to_gen_swizzle(pCreateInfo->components.g,
+                                                VK_COMPONENT_SWIZZLE_G),
+      .ShaderChannelSelectB = vk_to_gen_swizzle(pCreateInfo->components.b,
+                                                VK_COMPONENT_SWIZZLE_B),
+      .ShaderChannelSelectA = vk_to_gen_swizzle(pCreateInfo->components.a,
+                                                VK_COMPONENT_SWIZZLE_A),
+#  else /* XXX: Seriously? */
+      .RedClearColor = 0,
+      .GreenClearColor = 0,
+      .BlueClearColor = 0,
+      .AlphaClearColor = 0,
+#  endif
+      .ResourceMinLOD = 0.0,
+      .SurfaceBaseAddress = { NULL, iview->offset },
+   };
+
+   if (image->needs_nonrt_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->nonrt_surface_state = alloc_surface_state(device, cmd_buffer);
+
+      surface_state.RenderCacheReadWriteMode = false;
+
+      /* For non render target surfaces, the hardware interprets field
+       * MIPCount/LOD as MIPCount.  The range of levels accessible by the
+       * sampler engine is [SurfaceMinLOD, SurfaceMinLOD + MIPCountLOD].
+       */
+      surface_state.SurfaceMinLOD = range->baseMipLevel;
+      surface_state.MIPCountLOD = MAX2(range->levelCount, 1) - 1;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->nonrt_surface_state.map,
+                                      &surface_state);
+
+      if (!device->info.has_llc)
+         anv_state_clflush(iview->nonrt_surface_state);
+   } else {
+      iview->nonrt_surface_state.alloc_size = 0;
+   }
+
+   if (image->needs_color_rt_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->color_rt_surface_state = alloc_surface_state(device, cmd_buffer);
+
+      surface_state.RenderCacheReadWriteMode = 0; /* Write only */
+
+      /* For render target surfaces, the hardware interprets field MIPCount/LOD as
+       * LOD. The Broadwell PRM says:
+       *
+       *    MIPCountLOD defines the LOD that will be rendered into.
+       *    SurfaceMinLOD is ignored.
+       */
+      surface_state.MIPCountLOD = range->baseMipLevel;
+      surface_state.SurfaceMinLOD = 0;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->color_rt_surface_state.map,
+                                      &surface_state);
+      if (!device->info.has_llc)
+         anv_state_clflush(iview->color_rt_surface_state);
+   } else {
+      iview->color_rt_surface_state.alloc_size = 0;
+   }
+
+   if (image->needs_storage_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->storage_surface_state = alloc_surface_state(device, cmd_buffer);
+
+      surface_state.SurfaceType =
+         anv_surftype(image, pCreateInfo->viewType, true),
+
+      surface_state.SurfaceFormat =
+         isl_lower_storage_image_format(&device->isl_dev, iview->format);
+
+      surface_state.SurfaceMinLOD = range->baseMipLevel;
+      surface_state.MIPCountLOD = MAX2(range->levelCount, 1) - 1;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->storage_surface_state.map,
+                                      &surface_state);
+   } else {
+      iview->storage_surface_state.alloc_size = 0;
+   }
+}
diff --git a/src/vulkan/gen8_cmd_buffer.c b/src/vulkan/gen8_cmd_buffer.c
new file mode 100644 (file)
index 0000000..096ced5
--- /dev/null
@@ -0,0 +1,1255 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen8_pack.h"
+#include "gen9_pack.h"
+
+/* Emit 3DSTATE_CONSTANT_* packets for every graphics stage with dirty push
+ * constants.  Returns the mask of VkShaderStageFlags that were flushed and
+ * clears those bits from push_constants_dirty.  Compute is handled
+ * elsewhere and skipped here.
+ */
+static uint32_t
+cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* 3DCommandSubOpcode values for the per-stage 3DSTATE_CONSTANT_* packets;
+    * we emit the VS packet template and override the sub-opcode to retarget
+    * it at the right stage.
+    */
+   static const uint32_t push_constant_opcodes[] = {
+      [MESA_SHADER_VERTEX]                      = 21,
+      [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
+      [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
+      [MESA_SHADER_GEOMETRY]                    = 22,
+      [MESA_SHADER_FRAGMENT]                    = 23,
+      [MESA_SHADER_COMPUTE]                     = 0,
+   };
+
+   VkShaderStageFlags flushed = 0;
+
+   anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
+      if (stage == MESA_SHADER_COMPUTE)
+         continue;
+
+      struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
+
+      /* No push constant data for this stage. */
+      if (state.offset == 0)
+         continue;
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS),
+                     ._3DCommandSubOpcode = push_constant_opcodes[stage],
+                     .ConstantBody = {
+                        .PointerToConstantBuffer0 = { .offset = state.offset },
+                        /* Read length is in 32-byte units. */
+                        .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+                     });
+
+      flushed |= mesa_to_vk_shader_stage(stage);
+   }
+
+   cmd_buffer->state.push_constants_dirty &= ~flushed;
+
+   return flushed;
+}
+
+#if ANV_GEN == 8
+/* Allocate and emit SF_CLIP_VIEWPORT and CC_VIEWPORT state for `count`
+ * viewports, then point the hardware at them.  Each SF_CLIP entry is 64
+ * bytes and each CC entry is 8 bytes, matching the allocation strides used
+ * when writing below.
+ */
+static void
+emit_viewport_state(struct anv_cmd_buffer *cmd_buffer,
+                    uint32_t count, const VkViewport *viewports)
+{
+   struct anv_state sf_clip_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
+   struct anv_state cc_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
+
+   for (uint32_t i = 0; i < count; i++) {
+      const VkViewport *vp = &viewports[i];
+
+      /* The gen7 state struct has just the matrix and guardband fields, the
+       * gen8 struct adds the min/max viewport fields. */
+      struct GENX(SF_CLIP_VIEWPORT) sf_clip_viewport = {
+         .ViewportMatrixElementm00 = vp->width / 2,
+         .ViewportMatrixElementm11 = vp->height / 2,
+         .ViewportMatrixElementm22 = 1.0,
+         .ViewportMatrixElementm30 = vp->x + vp->width / 2,
+         .ViewportMatrixElementm31 = vp->y + vp->height / 2,
+         .ViewportMatrixElementm32 = 0.0,
+         .XMinClipGuardband = -1.0f,
+         .XMaxClipGuardband = 1.0f,
+         .YMinClipGuardband = -1.0f,
+         .YMaxClipGuardband = 1.0f,
+         .XMinViewPort = vp->x,
+         .XMaxViewPort = vp->x + vp->width - 1,
+         .YMinViewPort = vp->y,
+         .YMaxViewPort = vp->y + vp->height - 1,
+      };
+
+      struct GENX(CC_VIEWPORT) cc_viewport = {
+         .MinimumDepth = vp->minDepth,
+         .MaximumDepth = vp->maxDepth
+      };
+
+      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64,
+                                 &sf_clip_viewport);
+      GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
+   }
+
+   /* Flush the CPU-written state on platforms without a coherent LLC. */
+   if (!cmd_buffer->device->info.has_llc) {
+      anv_state_clflush(sf_clip_state);
+      anv_state_clflush(cc_state);
+   }
+
+   anv_batch_emit(&cmd_buffer->batch,
+                  GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC),
+                  .CCViewportPointer = cc_state.offset);
+   anv_batch_emit(&cmd_buffer->batch,
+                  GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP),
+                  .SFClipViewportPointer = sf_clip_state.offset);
+}
+
+/* Emit the current dynamic viewport state, or a single framebuffer-sized
+ * default viewport when none has been set.
+ */
+void
+gen8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
+{
+   const uint32_t vp_count = cmd_buffer->state.dynamic.viewport.count;
+
+   if (vp_count == 0) {
+      /* If viewport count is 0, this is taken to mean "use the default" */
+      const VkViewport default_vp = {
+         .x = 0.0f,
+         .y = 0.0f,
+         .width = cmd_buffer->state.framebuffer->width,
+         .height = cmd_buffer->state.framebuffer->height,
+         .minDepth = 0.0f,
+         .maxDepth = 1.0f,
+      };
+      emit_viewport_state(cmd_buffer, 1, &default_vp);
+      return;
+   }
+
+   emit_viewport_state(cmd_buffer, vp_count,
+                       cmd_buffer->state.dynamic.viewport.viewports);
+}
+#endif
+
+/* Emit MI_LOAD_REGISTER_MEM: load MMIO register `reg` from `bo` + `offset`. */
+static void
+emit_lrm(struct anv_batch *batch,
+         uint32_t reg, struct anv_bo *bo, uint32_t offset)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
+                  .RegisterAddress = reg,
+                  .MemoryAddress = { bo, offset });
+}
+
+/* Emit MI_LOAD_REGISTER_IMM: write immediate `imm` to MMIO register `reg`. */
+static void
+emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
+                  .RegisterOffset = reg,
+                  .DataDWord = imm);
+}
+
+/* MMIO address of the gen8 L3 partitioning control register. */
+#define GEN8_L3CNTLREG                  0x7034
+
+/* Program the L3 cache partitioning, reserving space for SLM when
+ * `enable_slm` is set.  No-ops if the requested configuration is already
+ * current.  The two stalling PIPE_CONTROLs are required by the hardware
+ * before touching the L3 config register; do not reorder them.
+ */
+static void
+config_l3(struct anv_cmd_buffer *cmd_buffer, bool enable_slm)
+{
+   /* References for GL state:
+    *
+    * - commits e307cfa..228d5a3
+    * - src/mesa/drivers/dri/i965/gen7_l3_state.c
+    */
+
+   uint32_t val = enable_slm ?
+      /* All = 48 ways; URB = 16 ways; DC and RO = 0, SLM = 1 */
+      0x60000021 :
+      /* All = 48 ways; URB = 48 ways; DC, RO and SLM = 0 */
+      0x60000060;
+   bool changed = cmd_buffer->state.current_l3_config != val;
+
+   if (changed) {
+      /* According to the hardware docs, the L3 partitioning can only be changed
+       * while the pipeline is completely drained and the caches are flushed,
+       * which involves a first PIPE_CONTROL flush which stalls the pipeline and
+       * initiates invalidation of the relevant caches...
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .TextureCacheInvalidationEnable = true,
+                     .ConstantCacheInvalidationEnable = true,
+                     .InstructionCacheInvalidateEnable = true,
+                     .DCFlushEnable = true,
+                     .PostSyncOperation = NoWrite,
+                     .CommandStreamerStallEnable = true);
+
+      /* ...followed by a second stalling flush which guarantees that
+       * invalidation is complete when the L3 configuration registers are
+       * modified.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .DCFlushEnable = true,
+                     .PostSyncOperation = NoWrite,
+                     .CommandStreamerStallEnable = true);
+
+      emit_lri(&cmd_buffer->batch, GEN8_L3CNTLREG, val);
+      /* Remember the programmed value so redundant reconfigures are skipped. */
+      cmd_buffer->state.current_l3_config = val;
+   }
+}
+
+/* Ensure the hardware is in 3D pipeline mode (as opposed to GPGPU/media).
+ *
+ * Also programs the non-SLM L3 configuration, since 3D workloads never use
+ * shared local memory.  PIPELINE_SELECT is only emitted on an actual mode
+ * change, tracked in cmd_buffer->state.current_pipeline.
+ */
+static void
+flush_pipeline_select_3d(struct anv_cmd_buffer *cmd_buffer)
+{
+   config_l3(cmd_buffer, false);
+
+   if (cmd_buffer->state.current_pipeline != _3D) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+#if ANV_GEN >= 9
+                     /* Gen9+ requires mask bits to make the selection stick. */
+                     .MaskBits = 3,
+#endif
+                     .PipelineSelection = _3D);
+      cmd_buffer->state.current_pipeline = _3D;
+   }
+}
+
+/* Flush all dirty 3D state to the batch before a draw call.
+ *
+ * Emits, in order: pipeline select, dirty vertex buffers, the pipeline's
+ * pre-baked batch, descriptor/push-constant tables and their pointers,
+ * viewport, scissor, SF, RASTER, color-calc/stencil state (gen8 vs. gen9
+ * split below), and the 3DSTATE_VF cut index.  The ordering is significant;
+ * see the comment block before the descriptor flush.  Clears the handled
+ * dirty bits on the way out.  Must not be used for compute (asserted).
+ */
+static void
+cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   uint32_t *p;
+
+   /* Only re-emit vertex buffers that are both dirty and actually read by
+    * the bound pipeline.
+    */
+   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+
+   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+   flush_pipeline_select_3d(cmd_buffer);
+
+   if (vb_emit) {
+      const uint32_t num_buffers = __builtin_popcount(vb_emit);
+      /* One header dword plus 4 dwords of VERTEX_BUFFER_STATE per buffer. */
+      const uint32_t num_dwords = 1 + num_buffers * 4;
+
+      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+                          GENX(3DSTATE_VERTEX_BUFFERS));
+      uint32_t vb, i = 0;
+      for_each_bit(vb, vb_emit) {
+         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
+         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
+
+         struct GENX(VERTEX_BUFFER_STATE) state = {
+            .VertexBufferIndex = vb,
+            .MemoryObjectControlState = GENX(MOCS),
+            .AddressModifyEnable = true,
+            .BufferPitch = pipeline->binding_stride[vb],
+            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+            .BufferSize = buffer->size - offset
+         };
+
+         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
+         i++;
+      }
+   }
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      /* If somebody compiled a pipeline after starting a command buffer the
+       * scratch bo may have grown since we started this cmd buffer (and
+       * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
+       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
+      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
+         anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+
+      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+   }
+
+   /* We emit the binding tables and sampler tables first, then emit push
+    * constants and then finally emit binding table and sampler table
+    * pointers.  It has to happen in this order, since emitting the binding
+    * tables may change the push constants (in case of storage images). After
+    * emitting push constants, on SKL+ we have to emit the corresponding
+    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+    */
+   uint32_t dirty = 0;
+   if (cmd_buffer->state.descriptors_dirty)
+      dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
+
+   if (cmd_buffer->state.push_constants_dirty)
+      dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
+
+   if (dirty)
+      gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
+      gen8_cmd_buffer_emit_viewport(cmd_buffer);
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
+      gen7_cmd_buffer_emit_scissor(cmd_buffer);
+
+   /* Merge the dynamic line width into the pipeline's pre-packed 3DSTATE_SF. */
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)) {
+      uint32_t sf_dw[GENX(3DSTATE_SF_length)];
+      struct GENX(3DSTATE_SF) sf = {
+         GENX(3DSTATE_SF_header),
+         .LineWidth = cmd_buffer->state.dynamic.line_width,
+      };
+      GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
+      /* FIXME: gen9.fs */
+      anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gen8.sf);
+   }
+
+   /* Merge the dynamic depth-bias values into the pipeline's 3DSTATE_RASTER. */
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)){
+      bool enable_bias = cmd_buffer->state.dynamic.depth_bias.bias != 0.0f ||
+         cmd_buffer->state.dynamic.depth_bias.slope != 0.0f;
+
+      uint32_t raster_dw[GENX(3DSTATE_RASTER_length)];
+      struct GENX(3DSTATE_RASTER) raster = {
+         GENX(3DSTATE_RASTER_header),
+         .GlobalDepthOffsetEnableSolid = enable_bias,
+         .GlobalDepthOffsetEnableWireframe = enable_bias,
+         .GlobalDepthOffsetEnablePoint = enable_bias,
+         .GlobalDepthOffsetConstant = cmd_buffer->state.dynamic.depth_bias.bias,
+         .GlobalDepthOffsetScale = cmd_buffer->state.dynamic.depth_bias.slope,
+         .GlobalDepthOffsetClamp = cmd_buffer->state.dynamic.depth_bias.clamp
+      };
+      GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster);
+      anv_batch_emit_merge(&cmd_buffer->batch, raster_dw,
+                           pipeline->gen8.raster);
+   }
+
+   /* Stencil reference values moved from COLOR_CALC_STATE in gen8 to
+    * 3DSTATE_WM_DEPTH_STENCIL in gen9. That means the dirty bits gets split
+    * across different state packets for gen8 and gen9. We handle that by
+    * using a big old #if switch here.
+    */
+#if ANV_GEN == 8
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
+      struct anv_state cc_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                            GEN8_COLOR_CALC_STATE_length * 4,
+                                            64);
+      struct GEN8_COLOR_CALC_STATE cc = {
+         .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
+         .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
+         .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
+         .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
+         .StencilReferenceValue =
+            cmd_buffer->state.dynamic.stencil_reference.front,
+         .BackFaceStencilReferenceValue =
+            cmd_buffer->state.dynamic.stencil_reference.back,
+      };
+      GEN8_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
+
+      /* Without LLC the CPU-written state must be flushed to memory by hand. */
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(cc_state);
+
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN8_3DSTATE_CC_STATE_POINTERS,
+                     .ColorCalcStatePointer = cc_state.offset,
+                     .ColorCalcStatePointerValid = true);
+   }
+
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
+      uint32_t wm_depth_stencil_dw[GEN8_3DSTATE_WM_DEPTH_STENCIL_length];
+
+      struct GEN8_3DSTATE_WM_DEPTH_STENCIL wm_depth_stencil = {
+         GEN8_3DSTATE_WM_DEPTH_STENCIL_header,
+
+         /* Is this what we need to do? */
+         .StencilBufferWriteEnable =
+            cmd_buffer->state.dynamic.stencil_write_mask.front != 0,
+
+         .StencilTestMask =
+            cmd_buffer->state.dynamic.stencil_compare_mask.front & 0xff,
+         .StencilWriteMask =
+            cmd_buffer->state.dynamic.stencil_write_mask.front & 0xff,
+
+         .BackfaceStencilTestMask =
+            cmd_buffer->state.dynamic.stencil_compare_mask.back & 0xff,
+         .BackfaceStencilWriteMask =
+            cmd_buffer->state.dynamic.stencil_write_mask.back & 0xff,
+      };
+      GEN8_3DSTATE_WM_DEPTH_STENCIL_pack(NULL, wm_depth_stencil_dw,
+                                         &wm_depth_stencil);
+
+      anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
+                           pipeline->gen8.wm_depth_stencil);
+   }
+#else
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
+      struct anv_state cc_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                            GEN9_COLOR_CALC_STATE_length * 4,
+                                            64);
+      struct GEN9_COLOR_CALC_STATE cc = {
+         .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
+         .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
+         .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
+         .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
+      };
+      GEN9_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
+
+      /* Without LLC the CPU-written state must be flushed to memory by hand. */
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(cc_state);
+
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN9_3DSTATE_CC_STATE_POINTERS,
+                     .ColorCalcStatePointer = cc_state.offset,
+                     .ColorCalcStatePointerValid = true);
+   }
+
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
+      uint32_t dwords[GEN9_3DSTATE_WM_DEPTH_STENCIL_length];
+      struct anv_dynamic_state *d = &cmd_buffer->state.dynamic;
+      struct GEN9_3DSTATE_WM_DEPTH_STENCIL wm_depth_stencil = {
+         GEN9_3DSTATE_WM_DEPTH_STENCIL_header,
+
+         .StencilBufferWriteEnable = d->stencil_write_mask.front != 0,
+
+         .StencilTestMask = d->stencil_compare_mask.front & 0xff,
+         .StencilWriteMask = d->stencil_write_mask.front & 0xff,
+
+         .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
+         .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
+
+         .StencilReferenceValue = d->stencil_reference.front,
+         .BackfaceStencilReferenceValue = d->stencil_reference.back
+      };
+      GEN9_3DSTATE_WM_DEPTH_STENCIL_pack(NULL, dwords, &wm_depth_stencil);
+
+      anv_batch_emit_merge(&cmd_buffer->batch, dwords,
+                           pipeline->gen9.wm_depth_stencil);
+   }
+#endif
+
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_INDEX_BUFFER)) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF),
+         .IndexedDrawCutIndexEnable = pipeline->primitive_restart,
+         .CutIndex = cmd_buffer->state.restart_index,
+      );
+   }
+
+   cmd_buffer->state.vb_dirty &= ~vb_emit;
+   cmd_buffer->state.dirty = 0;
+}
+
+/* vkCmdDraw: flush dirty 3D state, then emit a sequential (non-indexed)
+ * 3DPRIMITIVE with the given vertex/instance ranges.
+ */
+void genX(CmdDraw)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    vertexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstVertex,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+                  .VertexAccessType = SEQUENTIAL,
+                  .VertexCountPerInstance = vertexCount,
+                  .StartVertexLocation = firstVertex,
+                  .InstanceCount = instanceCount,
+                  .StartInstanceLocation = firstInstance,
+                  .BaseVertexLocation = 0);
+}
+
+/* vkCmdDrawIndexed: flush dirty 3D state, then emit an indexed (RANDOM
+ * access) 3DPRIMITIVE.  The index buffer itself is bound separately via
+ * CmdBindIndexBuffer.
+ */
+void genX(CmdDrawIndexed)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    indexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstIndex,
+    int32_t                                     vertexOffset,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+                  .VertexAccessType = RANDOM,
+                  .VertexCountPerInstance = indexCount,
+                  .StartVertexLocation = firstIndex,
+                  .InstanceCount = instanceCount,
+                  .StartInstanceLocation = firstInstance,
+                  .BaseVertexLocation = vertexOffset);
+}
+
+/* Auto-Draw / Indirect Registers */
+#define GEN7_3DPRIM_END_OFFSET          0x2420
+#define GEN7_3DPRIM_START_VERTEX        0x2430
+#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
+#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
+#define GEN7_3DPRIM_START_INSTANCE      0x243C
+#define GEN7_3DPRIM_BASE_VERTEX         0x2440
+
+/* vkCmdDrawIndirect: load the VkDrawIndirectCommand fields from the buffer
+ * into the 3DPRIM_* MMIO registers, then fire an indirect 3DPRIMITIVE.
+ *
+ * NOTE(review): drawCount and stride are unused -- only a single draw record
+ * is executed.  Presumably multiDrawIndirect is not advertised so drawCount
+ * is always <= 1; confirm against the device feature setup.
+ */
+void genX(CmdDrawIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   /* VkDrawIndirectCommand layout: vertexCount, instanceCount, firstVertex,
+    * firstInstance -- loaded at +0, +4, +8, +12.  Base vertex is always 0
+    * for non-indexed draws.
+    */
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+                  .IndirectParameterEnable = true,
+                  .VertexAccessType = SEQUENTIAL);
+}
+
+/* vkCmdBindIndexBuffer: emit 3DSTATE_INDEX_BUFFER and remember the
+ * type-appropriate primitive-restart index (emitted later via 3DSTATE_VF
+ * in cmd_buffer_flush_state).
+ */
+void genX(CmdBindIndexBuffer)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    VkIndexType                                 indexType)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+   /* Map Vulkan index types to the hardware index-format encodings. */
+   static const uint32_t vk_to_gen_index_type[] = {
+      [VK_INDEX_TYPE_UINT16]                    = INDEX_WORD,
+      [VK_INDEX_TYPE_UINT32]                    = INDEX_DWORD,
+   };
+
+   /* Vulkan defines the restart index as the max value of the index type. */
+   static const uint32_t restart_index_for_type[] = {
+      [VK_INDEX_TYPE_UINT16]                    = UINT16_MAX,
+      [VK_INDEX_TYPE_UINT32]                    = UINT32_MAX,
+   };
+
+   cmd_buffer->state.restart_index = restart_index_for_type[indexType];
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER),
+                  .IndexFormat = vk_to_gen_index_type[indexType],
+                  .MemoryObjectControlState = GENX(MOCS),
+                  .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+                  .BufferSize = buffer->size - offset);
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
+}
+
+/* Emit the compute-stage descriptor state: sampler and binding tables, the
+ * CURBE push-constant load, and the INTERFACE_DESCRIPTOR_DATA used by
+ * MEDIA_INTERFACE_DESCRIPTOR_LOAD.
+ *
+ * Returns VK_SUCCESS, or propagates the failure from the sampler/binding
+ * table emission (e.g. out of binding-table space).
+ */
+static VkResult
+flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_device *device = cmd_buffer->device;
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct anv_state surfaces = { 0, }, samplers = { 0, };
+   VkResult result;
+
+   result = anv_cmd_buffer_emit_samplers(cmd_buffer,
+                                         MESA_SHADER_COMPUTE, &samplers);
+   if (result != VK_SUCCESS)
+      return result;
+   result = anv_cmd_buffer_emit_binding_table(cmd_buffer,
+                                              MESA_SHADER_COMPUTE, &surfaces);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer);
+
+   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+
+   /* Push-constant payload = uniform params plus the per-thread local
+    * invocation IDs; sized in 32-byte registers (8 dwords each).
+    */
+   unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
+   unsigned push_constant_data_size =
+      (prog_data->nr_params + local_id_dwords) * 4;
+   unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
+   unsigned push_constant_regs = reg_aligned_constant_size / 32;
+
+   if (push_state.alloc_size) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD),
+                     .CURBETotalDataLength = push_state.alloc_size,
+                     .CURBEDataStartAddress = push_state.offset);
+   }
+
+   assert(prog_data->total_shared <= 64 * 1024);
+   /* SLM size field: encoded in 4k units and must be a power of two. */
+   uint32_t slm_size = 0;
+   if (prog_data->total_shared > 0) {
+      /* slm_size is in 4k increments, but must be a power of 2. */
+      slm_size = 4 * 1024;
+      while (slm_size < prog_data->total_shared)
+         slm_size <<= 1;
+      slm_size /= 4 * 1024;
+   }
+
+   struct anv_state state =
+      anv_state_pool_emit(&device->dynamic_state_pool,
+                          GENX(INTERFACE_DESCRIPTOR_DATA), 64,
+                          .KernelStartPointer = pipeline->cs_simd,
+                          .KernelStartPointerHigh = 0,
+                          .BindingTablePointer = surfaces.offset,
+                          .BindingTableEntryCount = 0,
+                          .SamplerStatePointer = samplers.offset,
+                          .SamplerCount = 0,
+                          .ConstantIndirectURBEntryReadLength = push_constant_regs,
+                          .ConstantURBEntryReadOffset = 0,
+                          .BarrierEnable = cs_prog_data->uses_barrier,
+                          .SharedLocalMemorySize = slm_size,
+                          .NumberofThreadsinGPGPUThreadGroup =
+                             pipeline->cs_thread_width_max);
+
+   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD),
+                  .InterfaceDescriptorTotalLength = size,
+                  .InterfaceDescriptorDataStartAddress = state.offset);
+
+   return VK_SUCCESS;
+}
+
+/* Flush all dirty compute state to the batch before a dispatch: L3 config
+ * (with SLM if the shader uses shared memory), PIPELINE_SELECT to GPGPU,
+ * the pipeline's pre-baked batch, and the compute descriptor set.
+ */
+static void
+cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   VkResult result;
+
+   assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
+
+   bool needs_slm = pipeline->cs_prog_data.base.total_shared > 0;
+   config_l3(cmd_buffer, needs_slm);
+
+   if (cmd_buffer->state.current_pipeline != GPGPU) {
+#if ANV_GEN < 10
+      /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
+       *
+       *   Software must clear the COLOR_CALC_STATE Valid field in
+       *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
+       *   with Pipeline Select set to GPGPU.
+       *
+       * The internal hardware docs recommend the same workaround for Gen9
+       * hardware too.
+       */
+      anv_batch_emit(&cmd_buffer->batch,
+                     GENX(3DSTATE_CC_STATE_POINTERS));
+#endif
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+#if ANV_GEN >= 9
+                     /* Gen9+ requires mask bits to make the selection stick. */
+                     .MaskBits = 3,
+#endif
+                     .PipelineSelection = GPGPU);
+      cmd_buffer->state.current_pipeline = GPGPU;
+   }
+
+   if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
+      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+
+   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
+       (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
+      result = flush_compute_descriptor_set(cmd_buffer);
+      /* NOTE(review): failure is only caught in debug builds here. */
+      assert(result == VK_SUCCESS);
+      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+   }
+
+   cmd_buffer->state.compute_dirty = 0;
+}
+
+/* vkCmdDrawIndexedIndirect: load the VkDrawIndexedIndirectCommand fields
+ * from the buffer into the 3DPRIM_* registers, then fire an indirect,
+ * indexed 3DPRIMITIVE.
+ *
+ * NOTE(review): drawCount and stride are unused -- only a single draw record
+ * is executed (see the matching note on CmdDrawIndirect).
+ */
+void genX(CmdDrawIndexedIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   cmd_buffer_flush_state(cmd_buffer);
+
+   /* VkDrawIndexedIndirectCommand layout: indexCount, instanceCount,
+    * firstIndex, vertexOffset, firstInstance -- loaded at +0..+16.
+    */
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+                  .IndirectParameterEnable = true,
+                  .VertexAccessType = RANDOM);
+}
+
+/* vkCmdDispatch: flush compute state and emit a GPGPU_WALKER for an
+ * (x, y, z) thread-group grid.  If the shader reads gl_NumWorkGroups, the
+ * group counts are first written to dynamic state so the driver can bind
+ * them as a buffer.
+ */
+void genX(CmdDispatch)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    x,
+    uint32_t                                    y,
+    uint32_t                                    z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+
+   if (prog_data->uses_num_work_groups) {
+      /* Stash the three group counts (12 bytes) in dynamic state. */
+      struct anv_state state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
+      uint32_t *sizes = state.map;
+      sizes[0] = x;
+      sizes[1] = y;
+      sizes[2] = z;
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(state);
+      cmd_buffer->state.num_workgroups_offset = state.offset;
+      cmd_buffer->state.num_workgroups_bo =
+         &cmd_buffer->device->dynamic_state_block_pool.bo;
+   }
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .ThreadGroupIDXDimension = x,
+                  .ThreadGroupIDYDimension = y,
+                  .ThreadGroupIDZDimension = z,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
+}
+
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+/* vkCmdDispatchIndirect: load the three VkDispatchIndirectCommand group
+ * counts from the buffer into the GPGPU_DISPATCHDIM* registers, then emit
+ * an indirect GPGPU_WALKER.  When gl_NumWorkGroups is used, the same buffer
+ * region is bound directly as the work-group-count source.
+ */
+void genX(CmdDispatchIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   if (prog_data->uses_num_work_groups) {
+      cmd_buffer->state.num_workgroups_offset = bo_offset;
+      cmd_buffer->state.num_workgroups_bo = bo;
+   }
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
+
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
+                  .IndirectParameterEnable = true,
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
+}
+
+/* Emit the depth/stencil buffer state for the current subpass:
+ * 3DSTATE_DEPTH_BUFFER (or the required null-depth programming),
+ * 3DSTATE_STENCIL_BUFFER, and null 3DSTATE_HIER_DEPTH_BUFFER /
+ * 3DSTATE_CLEAR_PARAMS (HiZ and fast clears are not used here).
+ */
+static void
+cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
+{
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   const struct anv_image_view *iview =
+      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
+   const struct anv_image *image = iview ? iview->image : NULL;
+
+   /* XXX: isl needs to grow depth format support */
+   const struct anv_format *anv_format =
+      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
+
+   const bool has_depth = iview && anv_format->depth_format;
+   const bool has_stencil = iview && anv_format->has_stencil;
+
+   /* FIXME: Implement the PMA stall W/A */
+   /* FIXME: Width and Height are wrong */
+
+   /* Emit 3DSTATE_DEPTH_BUFFER */
+   if (has_depth) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         /* A nonzero depth_format doubles as the write-enable flag here. */
+         .DepthWriteEnable = anv_format->depth_format,
+         .StencilWriteEnable = has_stencil,
+         .HierarchicalDepthBufferEnable = false,
+         .SurfaceFormat = anv_format->depth_format,
+         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->depth_surface.offset,
+         },
+         .Height = fb->height - 1,
+         .Width = fb->width - 1,
+         .LOD = 0,
+         .Depth = 1 - 1,
+         .MinimumArrayElement = 0,
+         .DepthBufferObjectControlState = GENX(MOCS),
+         .RenderTargetViewExtent = 1 - 1,
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2);
+   } else {
+      /* Even when no depth buffer is present, the hardware requires that
+       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
+       *
+       *    If a null depth buffer is bound, the driver must instead bind depth as:
+       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
+       *       3DSTATE_DEPTH.Width = 1
+       *       3DSTATE_DEPTH.Height = 1
+       *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
+       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
+       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
+       *
+       * The PRM is wrong, though. The width and height must be programmed to
+       * actual framebuffer's width and height, even when neither depth buffer
+       * nor stencil buffer is present.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         .SurfaceFormat = D16_UNORM,
+         .Width = fb->width - 1,
+         .Height = fb->height - 1,
+         .StencilWriteEnable = has_stencil);
+   }
+
+   /* Emit 3DSTATE_STENCIL_BUFFER */
+   if (has_stencil) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
+         .StencilBufferEnable = true,
+         .StencilBufferObjectControlState = GENX(MOCS),
+
+         /* Stencil buffers have strange pitch. The PRM says:
+          *
+          *    The pitch must be set to 2x the value computed based on width,
+          *    as the stencil buffer is stored with two rows interleaved.
+          */
+         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
+
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->offset + image->stencil_surface.offset,
+         },
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2);
+   } else {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER));
+   }
+
+   /* Disable hierarchical depth buffers. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER));
+
+   /* Clear the clear params. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS));
+}
+
+/**
+ * @see anv_cmd_buffer_set_subpass()
+ *
+ * Switch the command buffer to a new subpass: record it in state, mark
+ * fragment descriptors dirty (the render-target binding table changes per
+ * subpass), and re-emit the depth/stencil buffer state.
+ */
+void
+genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_subpass *subpass)
+{
+   cmd_buffer->state.subpass = subpass;
+
+   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+
+   cmd_buffer_emit_depth_stencil(cmd_buffer);
+}
+
+/* vkCmdBeginRenderPass: record the framebuffer and render pass, set up
+ * attachment clear state, program the drawing rectangle from renderArea,
+ * enter the first subpass, and perform the subpass's load-op clears.
+ */
+void genX(CmdBeginRenderPass)(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBegin,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
+   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
+
+   cmd_buffer->state.framebuffer = framebuffer;
+   cmd_buffer->state.pass = pass;
+   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
+
+   flush_pipeline_select_3d(cmd_buffer);
+
+   const VkRect2D *render_area = &pRenderPassBegin->renderArea;
+
+   /* Drawing-rectangle bounds are inclusive, hence the "- 1". */
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE),
+                  .ClippedDrawingRectangleYMin = render_area->offset.y,
+                  .ClippedDrawingRectangleXMin = render_area->offset.x,
+                  .ClippedDrawingRectangleYMax =
+                     render_area->offset.y + render_area->extent.height - 1,
+                  .ClippedDrawingRectangleXMax =
+                     render_area->offset.x + render_area->extent.width - 1,
+                  .DrawingRectangleOriginY = 0,
+                  .DrawingRectangleOriginX = 0);
+
+   genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+/* vkCmdNextSubpass: advance to the next subpass in the current render pass
+ * and perform its load-op clears.  Only valid in primary command buffers.
+ */
+void genX(CmdNextSubpass)(
+    VkCommandBuffer                             commandBuffer,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+/* vkCmdEndRenderPass: end the render pass with a heavyweight flushing
+ * PIPE_CONTROL so render-target and depth writes become visible.
+ */
+void genX(CmdEndRenderPass)(
+    VkCommandBuffer                             commandBuffer)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* Emit a flushing pipe control at the end of a pass.  This is kind of a
+    * hack but it ensures that render targets always actually get written.
+    * Eventually, we should do flushing based on image format transitions
+    * or something of that nature.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                  .PostSyncOperation = NoWrite,
+                  .RenderTargetCacheFlushEnable = true,
+                  .InstructionCacheInvalidateEnable = true,
+                  .DepthCacheFlushEnable = true,
+                  .VFCacheInvalidationEnable = true,
+                  .TextureCacheInvalidationEnable = true,
+                  .CommandStreamerStallEnable = true);
+}
+
/* Emit a PIPE_CONTROL that writes the pixel-pipeline depth-pass sample count
 * (PS_DEPTH_COUNT) into `bo` at `offset`.  DepthStallEnable serializes the
 * write against outstanding depth work so the snapshot is coherent.
 */
static void
emit_ps_depth_count(struct anv_batch *batch,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(PIPE_CONTROL),
                  .DestinationAddressType = DAT_PPGTT,
                  .PostSyncOperation = WritePSDepthCount,
                  .DepthStallEnable = true,
                  .Address = { bo, offset });
}
+
/* Mark a query slot as available: write the immediate value 1 into `bo` at
 * `offset` once all prior work has completed.
 */
static void
emit_query_availability(struct anv_batch *batch,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(PIPE_CONTROL),
                  .DestinationAddressType = DAT_PPGTT,
                  .PostSyncOperation = WriteImmediateData,
                  .Address = { bo, offset },
                  .ImmediateData = 1);
}
+
/* Begin a query: snapshot the starting counter into the query slot.  For
 * occlusion queries this is PS_DEPTH_COUNT at slot offset 0; the end value
 * and availability are written by CmdEndQuery at offsets 8 and 16.
 * Pipeline-statistics queries are not implemented yet.
 */
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leaks into the first query
    * immediately after the clear. Doing a pipecontrol with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                     .DepthCacheFlushEnable = true,
                     .DepthStallEnable = true);
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Begin value goes at the start of the slot (offset 0). */
      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}
+
/* End a query: snapshot the ending counter and mark the slot available.
 * Slot layout (see anv_query_pool_slot): begin value at offset 0, end value
 * at offset 8, availability dword at offset 16.
 */
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* End value at slot offset 8... */
      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot) + 8);

      /* ...then availability at slot offset 16, ordered after the count. */
      emit_query_availability(&cmd_buffer->batch, &pool->bo,
                              query * sizeof(struct anv_query_pool_slot) + 16);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}
+
+#define TIMESTAMP 0x2358
+
+void genX(CmdWriteTimestamp)(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineStageFlagBits                     pipelineStage,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    query)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+   uint32_t offset = query * sizeof(struct anv_query_pool_slot);
+
+   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
+
+   switch (pipelineStage) {
+   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
+                     .RegisterAddress = TIMESTAMP,
+                     .MemoryAddress = { &pool->bo, offset });
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
+                     .RegisterAddress = TIMESTAMP + 4,
+                     .MemoryAddress = { &pool->bo, offset + 4 });
+      break;
+
+   default:
+      /* Everything else is bottom-of-pipe */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .DestinationAddressType = DAT_PPGTT,
+                     .PostSyncOperation = WriteTimestamp,
+                     .Address = { &pool->bo, offset });
+      break;
+   }
+
+   emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
+}
+
/* MI_MATH ALU instruction encoding: each instruction dword packs an opcode
 * in bits 20..31 and two operand selectors in bits 10..19 and 0..9.
 */
#define alu_opcode(v)   __gen_field((v),  20, 31)
#define alu_operand1(v) __gen_field((v),  10, 19)
#define alu_operand2(v) __gen_field((v),   0,  9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

/* ALU opcodes.  LOAD/LOADINV move an operand into SRCA/SRCB; the arithmetic
 * and logic ops consume SRCA/SRCB and leave their result in the accumulator;
 * STORE/STOREINV move the accumulator into a GPR.
 */
#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

/* ALU operand selectors: general-purpose registers R0-R4, the two source
 * operands, the accumulator, and the zero/carry flags.
 */
#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

/* MMIO address of command-streamer GPR n; each register is 64 bits wide. */
#define CS_GPR(n) (0x2600 + (n) * 8)
+
/* Load a 64-bit value from `bo` at `offset` into the command-streamer
 * register at MMIO address `reg`, as two 32-bit MI_LOAD_REGISTER_MEM loads
 * (low dword first).
 */
static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
                  .RegisterAddress = reg,
                  .MemoryAddress = { bo, offset });
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
                  .RegisterAddress = reg + 4,
                  .MemoryAddress = { bo, offset + 4 });
}
+
+static void
+store_query_result(struct anv_batch *batch, uint32_t reg,
+                   struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
+{
+      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
+                     .RegisterAddress = reg,
+                     .MemoryAddress = { bo, offset });
+
+      if (flags & VK_QUERY_RESULT_64_BIT)
+         anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
+                        .RegisterAddress = reg + 4,
+                        .MemoryAddress = { bo, offset + 4 });
+}
+
/* Copy query results from a pool into a buffer, entirely on the GPU.
 * Occlusion results are computed as (end - begin) with MI_MATH; timestamps
 * are copied directly.  Results land in CS GPR2 and are stored from there.
 *
 * NOTE(review): VK_QUERY_RESULT_WAIT_BIT is implemented as a CS stall, not
 * as a per-query availability wait — verify this is strong enough for all
 * callers.  Also, dst_offset is uint32_t while destOffset/destStride are
 * VkDeviceSize; large offsets would be truncated — TODO confirm.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset, dst_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT)
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                     .CommandStreamerStallEnable = true,
                     .StallAtPixelScoreboard = true);

   dst_offset = buffer->offset + destOffset;
   for (uint32_t i = 0; i < queryCount; i++) {

      slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         /* Load begin into GPR0 and end into GPR1 (see slot layout in
          * CmdEndQuery: begin@0, end@8, availability@16).
          */
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(0), &pool->bo, slot_offset);
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(1), &pool->bo, slot_offset + 8);

         /* FIXME: We need to clamp the result for 32 bit. */

         /* GPR2 = GPR1 - GPR0 (end - begin), via the CS ALU. */
         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[3] = alu(OPCODE_SUB, 0, 0);
         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamp is stored raw at the start of the slot; just load it
          * into GPR2 where store_query_result expects it.
          */
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset);
         break;

      default:
         unreachable("unhandled query type");
      }

      store_query_result(&cmd_buffer->batch,
                         CS_GPR(2), buffer->bo, dst_offset, flags);

      /* Per spec, availability follows the result in the destination when
       * requested: right after the 64- or 32-bit value respectively.
       */
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset + 16);
         if (flags & VK_QUERY_RESULT_64_BIT)
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 8, flags);
         else
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 4, flags);
      }

      dst_offset += destStride;
   }
}
+
/* Signal an event from the GPU: once prior work completes, a PIPE_CONTROL
 * post-sync write stores VK_EVENT_SET into the event's dword in the
 * dynamic-state pool.  stageMask is currently not used to narrow the wait.
 */
void genX(CmdSetEvent)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags                        stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                  .DestinationAddressType = DAT_PPGTT,
                  .PostSyncOperation = WriteImmediateData,
                  .Address = {
                     &cmd_buffer->device->dynamic_state_block_pool.bo,
                     event->state.offset
                   },
                  .ImmediateData = VK_EVENT_SET);
}
+
/* Reset an event from the GPU: identical to CmdSetEvent except the value
 * written is VK_EVENT_RESET.
 */
void genX(CmdResetEvent)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags                        stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                  .DestinationAddressType = DAT_PPGTT,
                  .PostSyncOperation = WriteImmediateData,
                  .Address = {
                     &cmd_buffer->device->dynamic_state_block_pool.bo,
                     event->state.offset
                   },
                  .ImmediateData = VK_EVENT_RESET);
}
+
/* Stall the command streamer until every listed event reads VK_EVENT_SET,
 * by polling each event's dword with MI_SEMAPHORE_WAIT, then apply the
 * accompanying memory barriers via CmdPipelineBarrier.
 *
 * NOTE(review): srcStageMask/destStageMask are only forwarded to the
 * barrier; the semaphore wait itself is unconditional — confirm intended.
 */
void genX(CmdWaitEvents)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    eventCount,
    const VkEvent*                              pEvents,
    VkPipelineStageFlags                        srcStageMask,
    VkPipelineStageFlags                        destStageMask,
    uint32_t                                    memoryBarrierCount,
    const VkMemoryBarrier*                      pMemoryBarriers,
    uint32_t                                    bufferMemoryBarrierCount,
    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
    uint32_t                                    imageMemoryBarrierCount,
    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

      /* Poll until the event dword equals VK_EVENT_SET. */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT),
                     .WaitMode = PollingMode,
                     .CompareOperation = SAD_EQUAL_SDD,
                     .SemaphoreDataDword = VK_EVENT_SET,
                     .SemaphoreAddress = {
                        &cmd_buffer->device->dynamic_state_block_pool.bo,
                        event->state.offset
                     });
   }

   genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask,
                            false, /* byRegion */
                            memoryBarrierCount, pMemoryBarriers,
                            bufferMemoryBarrierCount, pBufferMemoryBarriers,
                            imageMemoryBarrierCount, pImageMemoryBarriers);
}
diff --git a/src/vulkan/gen8_pack.h b/src/vulkan/gen8_pack.h
new file mode 100644 (file)
index 0000000..042e029
--- /dev/null
@@ -0,0 +1,9209 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+/* Instructions, enums and structures for BDW.
+ *
+ * This file has been generated, do not hand edit.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h>
+
+#ifndef __gen_validate_value
+#define __gen_validate_value(x)
+#endif
+
+#ifndef __gen_field_functions
+#define __gen_field_functions
+
/* Helper union for bit-casting a float to its raw 32-bit encoding. */
union __gen_value {
   float f;
   uint32_t dw;
};
+
/* "Must Be One" bits: return a mask with ones in bits [start, end].
 * Use an unsigned long long literal so the 64-bit shift is well-defined on
 * platforms where unsigned long is only 32 bits (the old ~0ul made the
 * `>> (64 - width)` shift undefined behavior there).
 */
static inline uint64_t
__gen_mbo(uint32_t start, uint32_t end)
{
   return (~0ull >> (64 - (end - start + 1))) << start;
}
+
/* Place value `v` into the bitfield spanning bits [start, end] of a dword or
 * qword.  In DEBUG builds, assert that the value actually fits in the field.
 * The range check uses 1ull (not 1ul) so the shift stays defined on 32-bit
 * long platforms when the field is 32 bits or wider.
 */
static inline uint64_t
__gen_field(uint64_t v, uint32_t start, uint32_t end)
{
   __gen_validate_value(v);
#if DEBUG
   if (end - start + 1 < 64)
      assert(v < 1ull << (end - start + 1));
#endif

   return v << start;
}
+
+static inline uint64_t
+__gen_fixed(float v, uint32_t start, uint32_t end,
+            bool is_signed, uint32_t fract_bits)
+{
+   __gen_validate_value(v);
+
+   const float factor = (1 << fract_bits);
+
+   float max, min;
+   if (is_signed) {
+      max = ((1 << (end - start)) - 1) / factor;
+      min = -(1 << (end - start)) / factor;
+   } else {
+      max = ((1 << (end - start + 1)) - 1) / factor;
+      min = 0.0f;
+   }
+
+   if (v > max)
+      v = max;
+   else if (v < min)
+      v = min;
+
+   int32_t int_val = roundf(v * factor);
+
+   if (is_signed)
+      int_val &= (1 << (end - start + 1)) - 1;
+
+   return int_val << start;
+}
+
/* Pass through an address/offset that is required to be aligned so that it
 * only occupies bits [start, end].  In DEBUG builds, assert no bits fall
 * outside that span.  The mask uses ~0ull (not ~0ul) so the 64-bit shift is
 * defined on platforms with 32-bit unsigned long.
 */
static inline uint64_t
__gen_offset(uint64_t v, uint32_t start, uint32_t end)
{
   __gen_validate_value(v);
#if DEBUG
   uint64_t mask = (~0ull >> (64 - (end - start + 1))) << start;

   assert((v & ~mask) == 0);
#endif

   return v;
}
+
/* Return the raw 32-bit encoding of `v`, type-punned through the
 * __gen_value union (well-defined in C, unlike a pointer cast).
 */
static inline uint32_t
__gen_float(float v)
{
   union __gen_value bits;

   __gen_validate_value(v);
   bits.f = v;
   return bits.dw;
}
+
+#ifndef __gen_address_type
+#error #define __gen_address_type before including this file
+#endif
+
+#ifndef __gen_user_data
+#error #define __gen_combine_address before including this file
+#endif
+
+#endif
+
/* 3DSTATE_URB_VS: allocates the VS section of the URB.  _length_bias is the
 * amount added to DwordLength to get the real command size in dwords;
 * _header pre-fills the fixed opcode fields for designated initializers.
 */
#define GEN8_3DSTATE_URB_VS_length_bias 0x00000002
#define GEN8_3DSTATE_URB_VS_header              \
   .CommandType          =  3,                  \
   .CommandSubType       =  3,                  \
   ._3DCommandOpcode     =  0,                  \
   ._3DCommandSubOpcode  = 48,                  \
   .DwordLength          =  0

#define GEN8_3DSTATE_URB_VS_length 0x00000002

/* Unpacked form of the command; see the _pack function for bit placement. */
struct GEN8_3DSTATE_URB_VS {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   uint32_t                                     VSURBStartingAddress;
   uint32_t                                     VSURBEntryAllocationSize;
   uint32_t                                     VSNumberofURBEntries;
};
+
/* Pack a GEN8_3DSTATE_URB_VS struct into its 2-dword hardware encoding at
 * `dst`.  Generated code: field positions come from the BDW PRM.
 */
static inline void
GEN8_3DSTATE_URB_VS_pack(__gen_user_data *data, void * restrict dst,
                         const struct GEN8_3DSTATE_URB_VS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->VSURBStartingAddress, 25, 31) |
      __gen_field(values->VSURBEntryAllocationSize, 16, 24) |
      __gen_field(values->VSNumberofURBEntries, 0, 15) |
      0;

}
+
/* 3DSTATE_VS: configures the vertex-shader fixed-function stage (kernel
 * pointer, thread limits, URB read/write layout).  The bare #defines between
 * fields are the named values that field accepts.
 */
#define GEN8_3DSTATE_VS_length_bias 0x00000002
#define GEN8_3DSTATE_VS_header                  \
   .CommandType          =  3,                  \
   .CommandSubType       =  3,                  \
   ._3DCommandOpcode     =  0,                  \
   ._3DCommandSubOpcode  = 16,                  \
   .DwordLength          =  7

#define GEN8_3DSTATE_VS_length 0x00000009

struct GEN8_3DSTATE_VS {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   uint64_t                                     KernelStartPointer;
#define     Multiple                                           0
#define     Single                                             1
   uint32_t                                     SingleVertexDispatch;
#define     Dmask                                              0
#define     Vmask                                              1
   uint32_t                                     VectorMaskEnable;
#define     NoSamplers                                         0
#define     _14Samplers                                        1
#define     _58Samplers                                        2
#define     _912Samplers                                       3
#define     _1316Samplers                                      4
   uint32_t                                     SamplerCount;
   uint32_t                                     BindingTableEntryCount;
#define     Normal                                             0
#define     High                                               1
   uint32_t                                     ThreadDispatchPriority;
#define     IEEE754                                            0
#define     Alternate                                          1
   uint32_t                                     FloatingPointMode;
   bool                                         IllegalOpcodeExceptionEnable;
   bool                                         AccessesUAV;
   bool                                         SoftwareExceptionEnable;
   uint64_t                                     ScratchSpaceBasePointer;
   uint32_t                                     PerThreadScratchSpace;
   uint32_t                                     DispatchGRFStartRegisterForURBData;
   uint32_t                                     VertexURBEntryReadLength;
   uint32_t                                     VertexURBEntryReadOffset;
   uint32_t                                     MaximumNumberofThreads;
   bool                                         StatisticsEnable;
   bool                                         SIMD8DispatchEnable;
   bool                                         VertexCacheDisable;
   bool                                         FunctionEnable;
   uint32_t                                     VertexURBEntryOutputReadOffset;
   uint32_t                                     VertexURBEntryOutputLength;
   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
};
+
/* Pack a GEN8_3DSTATE_VS struct into its 9-dword hardware encoding at `dst`.
 * 64-bit fields (kernel pointer, scratch base) are built as a qword and then
 * split across two consecutive dwords.
 */
static inline void
GEN8_3DSTATE_VS_pack(__gen_user_data *data, void * restrict dst,
                     const struct GEN8_3DSTATE_VS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint64_t qw1 =
      __gen_offset(values->KernelStartPointer, 6, 63) |
      0;

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

   dw[3] =
      __gen_field(values->SingleVertexDispatch, 31, 31) |
      __gen_field(values->VectorMaskEnable, 30, 30) |
      __gen_field(values->SamplerCount, 27, 29) |
      __gen_field(values->BindingTableEntryCount, 18, 25) |
      __gen_field(values->ThreadDispatchPriority, 17, 17) |
      __gen_field(values->FloatingPointMode, 16, 16) |
      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
      __gen_field(values->AccessesUAV, 12, 12) |
      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
      0;

   uint64_t qw4 =
      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
      __gen_field(values->PerThreadScratchSpace, 0, 3) |
      0;

   dw[4] = qw4;
   dw[5] = qw4 >> 32;

   dw[6] =
      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
      0;

   dw[7] =
      __gen_field(values->MaximumNumberofThreads, 23, 31) |
      __gen_field(values->StatisticsEnable, 10, 10) |
      __gen_field(values->SIMD8DispatchEnable, 2, 2) |
      __gen_field(values->VertexCacheDisable, 1, 1) |
      __gen_field(values->FunctionEnable, 0, 0) |
      0;

   dw[8] =
      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
      0;

}
+
/* GPGPU_CSR_BASE_ADDRESS: sets the base address used for GPGPU
 * context-save/restore.
 */
#define GEN8_GPGPU_CSR_BASE_ADDRESS_length_bias 0x00000002
#define GEN8_GPGPU_CSR_BASE_ADDRESS_header      \
   .CommandType          =  3,                  \
   .CommandSubType       =  0,                  \
   ._3DCommandOpcode     =  1,                  \
   ._3DCommandSubOpcode  =  4,                  \
   .DwordLength          =  1

#define GEN8_GPGPU_CSR_BASE_ADDRESS_length 0x00000003

struct GEN8_GPGPU_CSR_BASE_ADDRESS {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   /* Relocatable GPU address; resolved by __gen_combine_address at pack time. */
   __gen_address_type                           GPGPUCSRBaseAddress;
};
+
/* Pack GPGPU_CSR_BASE_ADDRESS into 3 dwords; the 64-bit address occupies
 * dwords 1-2 and is combined/relocated via __gen_combine_address.
 */
static inline void
GEN8_GPGPU_CSR_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
                                 const struct GEN8_GPGPU_CSR_BASE_ADDRESS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint32_t dw1 =
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->GPGPUCSRBaseAddress, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

}
+
/* MI_ATOMIC: performs an atomic read-modify-write on a memory location from
 * the command streamer.  Operand dwords are only present for inline-data
 * variants.
 */
#define GEN8_MI_ATOMIC_length_bias 0x00000002
#define GEN8_MI_ATOMIC_header                   \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 47

#define GEN8_MI_ATOMIC_length 0x00000003

struct GEN8_MI_ATOMIC {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
#define     PerProcessGraphicsAddress                          0
#define     GlobalGraphicsAddress                              1
   uint32_t                                     MemoryType;
   uint32_t                                     PostSyncOperation;
#define     DWORD                                              0
#define     QWORD                                              1
#define     OCTWORD                                            2
#define     RESERVED                                           3
   uint32_t                                     DataSize;
   uint32_t                                     InlineData;
   uint32_t                                     CSSTALL;
   uint32_t                                     ReturnDataControl;
   uint32_t                                     ATOMICOPCODE;
   uint32_t                                     DwordLength;
   __gen_address_type                           MemoryAddress;
   uint32_t                                     Operand1DataDword0;
   uint32_t                                     Operand2DataDword0;
   uint32_t                                     Operand1DataDword1;
   uint32_t                                     Operand2DataDword1;
   uint32_t                                     Operand1DataDword2;
   uint32_t                                     Operand2DataDword2;
   uint32_t                                     Operand1DataDword3;
   uint32_t                                     Operand2DataDword3;
};
+
/* Pack MI_ATOMIC.  NOTE(review): this always writes dwords 3-10 (the inline
 * operand data) even though _length is 3; presumably callers size the batch
 * allocation by DwordLength when InlineData is set — verify against the
 * emit path before relying on the short form.
 */
static inline void
GEN8_MI_ATOMIC_pack(__gen_user_data *data, void * restrict dst,
                    const struct GEN8_MI_ATOMIC * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->MemoryType, 22, 22) |
      __gen_field(values->PostSyncOperation, 21, 21) |
      __gen_field(values->DataSize, 19, 20) |
      __gen_field(values->InlineData, 18, 18) |
      __gen_field(values->CSSTALL, 17, 17) |
      __gen_field(values->ReturnDataControl, 16, 16) |
      __gen_field(values->ATOMICOPCODE, 8, 15) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint32_t dw1 =
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->MemoryAddress, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

   dw[3] =
      __gen_field(values->Operand1DataDword0, 0, 31) |
      0;

   dw[4] =
      __gen_field(values->Operand2DataDword0, 0, 31) |
      0;

   dw[5] =
      __gen_field(values->Operand1DataDword1, 0, 31) |
      0;

   dw[6] =
      __gen_field(values->Operand2DataDword1, 0, 31) |
      0;

   dw[7] =
      __gen_field(values->Operand1DataDword2, 0, 31) |
      0;

   dw[8] =
      __gen_field(values->Operand2DataDword2, 0, 31) |
      0;

   dw[9] =
      __gen_field(values->Operand1DataDword3, 0, 31) |
      0;

   dw[10] =
      __gen_field(values->Operand2DataDword3, 0, 31) |
      0;

}
+
/* MI_LOAD_REGISTER_REG: copy one MMIO register to another. */
#define GEN8_MI_LOAD_REGISTER_REG_length_bias 0x00000002
#define GEN8_MI_LOAD_REGISTER_REG_header        \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 42,                  \
   .DwordLength          =  1

#define GEN8_MI_LOAD_REGISTER_REG_length 0x00000003

struct GEN8_MI_LOAD_REGISTER_REG {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     DwordLength;
   /* Register addresses must be dword-aligned (bits 0-1 are zero). */
   uint32_t                                     SourceRegisterAddress;
   uint32_t                                     DestinationRegisterAddress;
};
+
/* Pack MI_LOAD_REGISTER_REG into 3 dwords; register addresses are encoded
 * with __gen_offset since their low two bits must be zero.
 */
static inline void
GEN8_MI_LOAD_REGISTER_REG_pack(__gen_user_data *data, void * restrict dst,
                               const struct GEN8_MI_LOAD_REGISTER_REG * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_offset(values->SourceRegisterAddress, 2, 22) |
      0;

   dw[2] =
      __gen_offset(values->DestinationRegisterAddress, 2, 22) |
      0;

}
+
/* MI_SEMAPHORE_SIGNAL: signal a semaphore on another engine (target engine
 * selected by TargetEngineSelect, context by TargetContextID).
 */
#define GEN8_MI_SEMAPHORE_SIGNAL_length_bias 0x00000002
#define GEN8_MI_SEMAPHORE_SIGNAL_header         \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 27,                  \
   .DwordLength          =  0

#define GEN8_MI_SEMAPHORE_SIGNAL_length 0x00000002

struct GEN8_MI_SEMAPHORE_SIGNAL {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     PostSyncOperation;
#define     RCS                                                0
#define     VCS0                                               1
#define     BCS                                                2
#define     VECS                                               3
#define     VCS1                                               4
   uint32_t                                     TargetEngineSelect;
   uint32_t                                     DwordLength;
   uint32_t                                     TargetContextID;
};
+
/* Pack MI_SEMAPHORE_SIGNAL into 2 dwords. */
static inline void
GEN8_MI_SEMAPHORE_SIGNAL_pack(__gen_user_data *data, void * restrict dst,
                              const struct GEN8_MI_SEMAPHORE_SIGNAL * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->PostSyncOperation, 21, 21) |
      __gen_field(values->TargetEngineSelect, 15, 17) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->TargetContextID, 0, 31) |
      0;

}
+
/* MI_SEMAPHORE_WAIT: stall the command streamer until the dword at
 * SemaphoreAddress compares true against SemaphoreDataDword, either by
 * polling or by waiting for a signal.
 */
#define GEN8_MI_SEMAPHORE_WAIT_length_bias 0x00000002
#define GEN8_MI_SEMAPHORE_WAIT_header           \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 28,                  \
   .DwordLength          =  2

#define GEN8_MI_SEMAPHORE_WAIT_length 0x00000004

struct GEN8_MI_SEMAPHORE_WAIT {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
#define     PerProcessGraphicsAddress                          0
#define     GlobalGraphicsAddress                              1
   uint32_t                                     MemoryType;
#define     PollingMode                                        1
#define     SignalMode                                         0
   uint32_t                                     WaitMode;
/* SAD = semaphore address data (in memory), SDD = semaphore data dword
 * (inline); the wait completes when the comparison holds.
 */
#define     SAD_GREATER_THAN_SDD                               0
#define     SAD_GREATER_THAN_OR_EQUAL_SDD                      1
#define     SAD_LESS_THAN_SDD                                  2
#define     SAD_LESS_THAN_OR_EQUAL_SDD                         3
#define     SAD_EQUAL_SDD                                      4
#define     SAD_NOT_EQUAL_SDD                                  5
   uint32_t                                     CompareOperation;
   uint32_t                                     DwordLength;
   uint32_t                                     SemaphoreDataDword;
   __gen_address_type                           SemaphoreAddress;
};
+
+static inline void
+GEN8_MI_SEMAPHORE_WAIT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_MI_SEMAPHORE_WAIT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->MemoryType, 22, 22) |
+      __gen_field(values->WaitMode, 15, 15) |
+      __gen_field(values->CompareOperation, 12, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SemaphoreDataDword, 0, 31) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SemaphoreAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+/* MI_STORE_REGISTER_MEM: store the value of a MMIO register
+ * (RegisterAddress) to a memory location (MemoryAddress).
+ */
+#define GEN8_MI_STORE_REGISTER_MEM_length_bias 0x00000002
+#define GEN8_MI_STORE_REGISTER_MEM_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 36,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_STORE_REGISTER_MEM_length 0x00000004
+
+struct GEN8_MI_STORE_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Pack a MI_STORE_REGISTER_MEM command (4 dwords) into dst. */
+static inline void
+GEN8_MI_STORE_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_MI_STORE_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->PredicateEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* __gen_offset places an already-aligned value; bits [2,22] hold the
+    * dword-aligned register offset.
+    */
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   /* 64-bit target address, relocated via __gen_combine_address and
+    * split across dw[2]/dw[3].
+    */
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+/* PIPELINE_SELECT: choose the active pipeline (3D / Media / GPGPU).
+ * Single-dword command; note the length bias of 1 (no DwordLength field).
+ */
+#define GEN8_PIPELINE_SELECT_length_bias 0x00000001
+#define GEN8_PIPELINE_SELECT_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4
+
+#define GEN8_PIPELINE_SELECT_length 0x00000001
+
+struct GEN8_PIPELINE_SELECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   /* Enum values for PipelineSelection: */
+#define     _3D                                                0
+#define     Media                                              1
+#define     GPGPU                                              2
+   uint32_t                                     PipelineSelection;
+};
+
+/* Pack a PIPELINE_SELECT command (1 dword) into dst. */
+static inline void
+GEN8_PIPELINE_SELECT_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN8_PIPELINE_SELECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->PipelineSelection, 0, 1) |
+      0;
+
+}
+
+#define GEN8_STATE_BASE_ADDRESS_length_bias 0x00000002
+#define GEN8_STATE_BASE_ADDRESS_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  1,                  \
+   .DwordLength          = 14
+
+#define GEN8_STATE_BASE_ADDRESS_length 0x00000010
+
+#define GEN8_MEMORY_OBJECT_CONTROL_STATE_length 0x00000001
+
+/* MEMORY_OBJECT_CONTROL_STATE (MOCS): cacheability control bits that are
+ * packed into a small field (bits 0-6 here) and then embedded by parent
+ * commands at whatever bit offset they require.
+ */
+struct GEN8_MEMORY_OBJECT_CONTROL_STATE {
+   /* Enum values for MemoryTypeLLCeLLCCacheabilityControl: */
+#define     UCwithFenceifcoherentcycle                         0
+#define     UCUncacheable                                      1
+#define     WT                                                 2
+#define     WB                                                 3
+   uint32_t                                     MemoryTypeLLCeLLCCacheabilityControl;
+   /* Enum values for TargetCache: */
+#define     eLLCOnlywheneDRAMispresentelsegetsallocatedinLLC       0
+#define     LLCOnly                                            1
+#define     LLCeLLCAllowed                                     2
+#define     L3DefertoPATforLLCeLLCselection                    3
+   uint32_t                                     TargetCache;
+   uint32_t                                     AgeforQUADLRU;
+};
+
+/* Pack a MOCS value into a single dword (only bits 0-6 are used). */
+static inline void
+GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN8_MEMORY_OBJECT_CONTROL_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->MemoryTypeLLCeLLCCacheabilityControl, 5, 6) |
+      __gen_field(values->TargetCache, 3, 4) |
+      __gen_field(values->AgeforQUADLRU, 0, 1) |
+      0;
+
+}
+
+/* STATE_BASE_ADDRESS: programs the five base addresses (general, surface,
+ * dynamic, indirect-object, instruction) that other state pointers are
+ * relative to, plus per-base MOCS and buffer-size fields.
+ */
+struct GEN8_STATE_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GeneralStateBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      GeneralStateMemoryObjectControlState;
+   bool                                         GeneralStateBaseAddressModifyEnable;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      StatelessDataPortAccessMemoryObjectControlState;
+   __gen_address_type                           SurfaceStateBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      SurfaceStateMemoryObjectControlState;
+   bool                                         SurfaceStateBaseAddressModifyEnable;
+   __gen_address_type                           DynamicStateBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      DynamicStateMemoryObjectControlState;
+   bool                                         DynamicStateBaseAddressModifyEnable;
+   __gen_address_type                           IndirectObjectBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      IndirectObjectMemoryObjectControlState;
+   bool                                         IndirectObjectBaseAddressModifyEnable;
+   __gen_address_type                           InstructionBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      InstructionMemoryObjectControlState;
+   bool                                         InstructionBaseAddressModifyEnable;
+   uint32_t                                     GeneralStateBufferSize;
+   bool                                         GeneralStateBufferSizeModifyEnable;
+   uint32_t                                     DynamicStateBufferSize;
+   bool                                         DynamicStateBufferSizeModifyEnable;
+   uint32_t                                     IndirectObjectBufferSize;
+   bool                                         IndirectObjectBufferSizeModifyEnable;
+   uint32_t                                     InstructionBufferSize;
+   /* Note the lowercase 's' in "Buffersize" — generated verbatim from
+    * the XML field name.
+    */
+   bool                                         InstructionBuffersizeModifyEnable;
+};
+
+/* Pack a STATE_BASE_ADDRESS command (16 dwords) into dst.
+ *
+ * Each base address occupies a dword pair: the MOCS value is first packed
+ * into a scratch dword, its bits are merged (together with the
+ * ModifyEnable bit) into the low dword, and the whole pair is produced by
+ * __gen_combine_address, which also lets the caller emit a relocation.
+ */
+static inline void
+GEN8_STATE_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN8_STATE_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1-2: general state base address + MOCS + modify enable. */
+   uint32_t dw_GeneralStateMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_GeneralStateMemoryObjectControlState, &values->GeneralStateMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_GeneralStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->GeneralStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->GeneralStateBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* dw3: stateless data-port access MOCS (no address). */
+   uint32_t dw_StatelessDataPortAccessMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StatelessDataPortAccessMemoryObjectControlState, &values->StatelessDataPortAccessMemoryObjectControlState);
+   dw[3] =
+      __gen_field(dw_StatelessDataPortAccessMemoryObjectControlState, 16, 22) |
+      0;
+
+   /* dw4-5: surface state base address. */
+   uint32_t dw_SurfaceStateMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceStateMemoryObjectControlState, &values->SurfaceStateMemoryObjectControlState);
+   uint32_t dw4 =
+      __gen_field(dw_SurfaceStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->SurfaceStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw4 =
+      __gen_combine_address(data, &dw[4], values->SurfaceStateBaseAddress, dw4);
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   /* dw6-7: dynamic state base address. */
+   uint32_t dw_DynamicStateMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DynamicStateMemoryObjectControlState, &values->DynamicStateMemoryObjectControlState);
+   uint32_t dw6 =
+      __gen_field(dw_DynamicStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->DynamicStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw6 =
+      __gen_combine_address(data, &dw[6], values->DynamicStateBaseAddress, dw6);
+
+   dw[6] = qw6;
+   dw[7] = qw6 >> 32;
+
+   /* dw8-9: indirect object base address. */
+   uint32_t dw_IndirectObjectMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_IndirectObjectMemoryObjectControlState, &values->IndirectObjectMemoryObjectControlState);
+   uint32_t dw8 =
+      __gen_field(dw_IndirectObjectMemoryObjectControlState, 4, 10) |
+      __gen_field(values->IndirectObjectBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->IndirectObjectBaseAddress, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   /* dw10-11: instruction base address. */
+   uint32_t dw_InstructionMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_InstructionMemoryObjectControlState, &values->InstructionMemoryObjectControlState);
+   uint32_t dw10 =
+      __gen_field(dw_InstructionMemoryObjectControlState, 4, 10) |
+      __gen_field(values->InstructionBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw10 =
+      __gen_combine_address(data, &dw[10], values->InstructionBaseAddress, dw10);
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+   /* dw12-15: buffer sizes (4KB-page granularity field in bits 12-31)
+    * with their modify-enable bits.
+    */
+   dw[12] =
+      __gen_field(values->GeneralStateBufferSize, 12, 31) |
+      __gen_field(values->GeneralStateBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[13] =
+      __gen_field(values->DynamicStateBufferSize, 12, 31) |
+      __gen_field(values->DynamicStateBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[14] =
+      __gen_field(values->IndirectObjectBufferSize, 12, 31) |
+      __gen_field(values->IndirectObjectBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[15] =
+      __gen_field(values->InstructionBufferSize, 12, 31) |
+      __gen_field(values->InstructionBuffersizeModifyEnable, 0, 0) |
+      0;
+
+}
+
+/* STATE_PREFETCH: prefetch state data starting at PrefetchPointer. */
+#define GEN8_STATE_PREFETCH_length_bias 0x00000002
+#define GEN8_STATE_PREFETCH_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN8_STATE_PREFETCH_length 0x00000002
+
+struct GEN8_STATE_PREFETCH {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PrefetchPointer;
+   uint32_t                                     PrefetchCount;
+};
+
+/* Pack a STATE_PREFETCH command (2 dwords) into dst. */
+static inline void
+GEN8_STATE_PREFETCH_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_STATE_PREFETCH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->PrefetchCount, 0, 2) |
+      0;
+
+   /* NOTE(review): unlike the 4-dword commands, the combined address is
+    * written to a single dword here (the command is only 2 dwords, so
+    * only the low 32 bits are kept) — confirm against the generator.
+    */
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PrefetchPointer, dw1);
+
+}
+
+/* STATE_SIP: set the 64-bit System Instruction Pointer. */
+#define GEN8_STATE_SIP_length_bias 0x00000002
+#define GEN8_STATE_SIP_header                   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2,                  \
+   .DwordLength          =  1
+
+#define GEN8_STATE_SIP_length 0x00000003
+
+struct GEN8_STATE_SIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     SystemInstructionPointer;
+};
+
+/* Pack a STATE_SIP command (3 dwords) into dst. */
+static inline void
+GEN8_STATE_SIP_pack(__gen_user_data *data, void * restrict dst,
+                    const struct GEN8_STATE_SIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* 16-byte-aligned SIP, bits [4,63], split across dw[1]/dw[2]. */
+   uint64_t qw1 =
+      __gen_offset(values->SystemInstructionPointer, 4, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+}
+
+/* SWTESS_BASE_ADDRESS: base address (+ MOCS) for software tessellation
+ * data.
+ */
+#define GEN8_SWTESS_BASE_ADDRESS_length_bias 0x00000002
+#define GEN8_SWTESS_BASE_ADDRESS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN8_SWTESS_BASE_ADDRESS_length 0x00000002
+
+struct GEN8_SWTESS_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           SWTessellationBaseAddress;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      SWTessellationMemoryObjectControlState;
+};
+
+/* Pack a SWTESS_BASE_ADDRESS command into dst.
+ * NOTE(review): _length is 2 dwords but the pack writes dw[0..2]
+ * (address split into dw[1]/dw[2]) — confirm against the generator.
+ */
+static inline void
+GEN8_SWTESS_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_SWTESS_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_SWTessellationMemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SWTessellationMemoryObjectControlState, &values->SWTessellationMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_SWTessellationMemoryObjectControlState, 8, 11) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->SWTessellationBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+}
+
+/* 3DPRIMITIVE: issue a draw call (direct or indirect). */
+#define GEN8_3DPRIMITIVE_length_bias 0x00000002
+#define GEN8_3DPRIMITIVE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  3,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  5
+
+#define GEN8_3DPRIMITIVE_length 0x00000007
+
+struct GEN8_3DPRIMITIVE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndirectParameterEnable;
+   uint32_t                                     UAVCoherencyRequired;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   bool                                         EndOffsetEnable;
+   /* Enum values for VertexAccessType (SEQUENTIAL = non-indexed draw,
+    * RANDOM = indexed draw): */
+#define     SEQUENTIAL                                         0
+#define     RANDOM                                             1
+   uint32_t                                     VertexAccessType;
+   uint32_t                                     PrimitiveTopologyType;
+   uint32_t                                     VertexCountPerInstance;
+   uint32_t                                     StartVertexLocation;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     StartInstanceLocation;
+   uint32_t                                     BaseVertexLocation;
+};
+
+/* Pack a 3DPRIMITIVE command (7 dwords) into dst. */
+static inline void
+GEN8_3DPRIMITIVE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN8_3DPRIMITIVE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->UAVCoherencyRequired, 9, 9) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->EndOffsetEnable, 9, 9) |
+      __gen_field(values->VertexAccessType, 8, 8) |
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->VertexCountPerInstance, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->StartVertexLocation, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->InstanceCount, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->StartInstanceLocation, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->BaseVertexLocation, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_AA_LINE_PARAMETERS: antialiased-line coverage bias/slope
+ * parameters. The float fields are converted to 0.8 fixed point by
+ * multiplying by (1 << 8) before packing.
+ */
+#define GEN8_3DSTATE_AA_LINE_PARAMETERS_length_bias 0x00000002
+#define GEN8_3DSTATE_AA_LINE_PARAMETERS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN8_3DSTATE_AA_LINE_PARAMETERS_length 0x00000003
+
+struct GEN8_3DSTATE_AA_LINE_PARAMETERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        AAPointCoverageBias;
+   float                                        AACoverageBias;
+   float                                        AAPointCoverageSlope;
+   float                                        AACoverageSlope;
+   float                                        AAPointCoverageEndCapBias;
+   float                                        AACoverageEndCapBias;
+   float                                        AAPointCoverageEndCapSlope;
+   float                                        AACoverageEndCapSlope;
+};
+
+/* Pack a 3DSTATE_AA_LINE_PARAMETERS command (3 dwords) into dst. */
+static inline void
+GEN8_3DSTATE_AA_LINE_PARAMETERS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_AA_LINE_PARAMETERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AAPointCoverageBias * (1 << 8), 24, 31) |
+      __gen_field(values->AACoverageBias * (1 << 8), 16, 23) |
+      __gen_field(values->AAPointCoverageSlope * (1 << 8), 8, 15) |
+      __gen_field(values->AACoverageSlope * (1 << 8), 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AAPointCoverageEndCapBias * (1 << 8), 24, 31) |
+      __gen_field(values->AACoverageEndCapBias * (1 << 8), 16, 23) |
+      __gen_field(values->AAPointCoverageEndCapSlope * (1 << 8), 8, 15) |
+      __gen_field(values->AACoverageEndCapSlope * (1 << 8), 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_DS: edit individual DS binding-table
+ * entries. Variable-length: _length is 0 and the entry list (packed with
+ * GEN8_BINDING_TABLE_EDIT_ENTRY_pack) follows the two fixed dwords.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 70
+
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS_length 0x00000000
+
+#define GEN8_BINDING_TABLE_EDIT_ENTRY_length 0x00000001
+
+/* One binding-table edit entry: table index + surface-state pointer. */
+struct GEN8_BINDING_TABLE_EDIT_ENTRY {
+   uint32_t                                     BindingTableIndex;
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Pack one BINDING_TABLE_EDIT_ENTRY (1 dword) into dst. */
+static inline void
+GEN8_BINDING_TABLE_EDIT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN8_BINDING_TABLE_EDIT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BindingTableIndex, 16, 23) |
+      __gen_offset(values->SurfaceStatePointer, 0, 15) |
+      0;
+
+}
+
+struct GEN8_3DSTATE_BINDING_TABLE_EDIT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+   /* Enum values for BindingTableEditTarget: */
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword prefix of 3DSTATE_BINDING_TABLE_EDIT_DS; the
+ * caller appends the edit entries itself.
+ */
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_EDIT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_BINDING_TABLE_EDIT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_GS: same layout as the DS variant, with
+ * sub-opcode 68; variable-length entry list follows the fixed dwords.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 68
+
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS_length 0x00000000
+
+struct GEN8_3DSTATE_BINDING_TABLE_EDIT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword prefix; caller appends the edit entries. */
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_EDIT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_BINDING_TABLE_EDIT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_HS: same layout as the DS variant, with
+ * sub-opcode 69; variable-length entry list follows the fixed dwords.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 69
+
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS_length 0x00000000
+
+struct GEN8_3DSTATE_BINDING_TABLE_EDIT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword prefix; caller appends the edit entries. */
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_EDIT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_BINDING_TABLE_EDIT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_PS: same layout as the DS variant, with
+ * sub-opcode 71; variable-length entry list follows the fixed dwords.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 71
+
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_PS_length 0x00000000
+
+struct GEN8_3DSTATE_BINDING_TABLE_EDIT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword prefix; caller appends the edit entries. */
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_EDIT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_BINDING_TABLE_EDIT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_VS: same layout as the DS variant, with
+ * sub-opcode 67; variable-length entry list follows the fixed dwords.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 67
+
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS_length 0x00000000
+
+struct GEN8_3DSTATE_BINDING_TABLE_EDIT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword prefix; caller appends the edit entries. */
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_EDIT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_BINDING_TABLE_EDIT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_DS (subopcode 40): two-dword command.
+ * DW1 carries the DS binding-table pointer in bits 5..15 via
+ * __gen_offset, i.e. a 32-byte-granular offset.
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 40,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS_length 0x00000002
+
+struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSBindingTable;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_GS (subopcode 41): two-dword command;
+ * DW1 packs the GS binding-table pointer into bits 5..15 (__gen_offset,
+ * 32-byte granularity).
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 41,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS_length 0x00000002
+
+struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSBindingTable;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_HS (subopcode 39): two-dword command;
+ * DW1 packs the HS binding-table pointer into bits 5..15 (__gen_offset,
+ * 32-byte granularity).
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 39,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS_length 0x00000002
+
+struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSBindingTable;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_PS (subopcode 42): two-dword command;
+ * DW1 packs the PS binding-table pointer into bits 5..15 (__gen_offset,
+ * 32-byte granularity).
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 42,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS_length 0x00000002
+
+struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSBindingTable;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_VS (subopcode 38): two-dword command;
+ * DW1 packs the VS binding-table pointer into bits 5..15 (__gen_offset,
+ * 32-byte granularity).
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 38,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS_length 0x00000002
+
+struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSBindingTable;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POOL_ALLOC (opcode 1, subopcode 25): four-dword
+ * command.  The pool base address is a 64-bit relocation: the low dword
+ * is first combined with BindingTablePoolEnable (bit 11) and the packed
+ * MEMORY_OBJECT_CONTROL_STATE (bits 0..6) via __gen_combine_address,
+ * then split across DW1/DW2.  DW3 carries BindingTablePoolBufferSize in
+ * bits 12..31 (i.e. 4KB granularity implied by the 12-bit shift —
+ * NOTE(review): confirm against the PRM).
+ */
+#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC_length_bias 0x00000002
+#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC_length 0x00000004
+
+struct GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BindingTablePoolBaseAddress;
+   uint32_t                                     BindingTablePoolEnable;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      SurfaceObjectControlState;
+#define     NoValidData                                        0
+   uint32_t                                     BindingTablePoolBufferSize;
+};
+
+static inline void
+GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                           const struct GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Pack the MOCS sub-struct into a scratch dword, merge it with the
+    * enable bit, then fold in the 64-bit pool base address relocation. */
+   uint32_t dw_SurfaceObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceObjectControlState, &values->SurfaceObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->BindingTablePoolEnable, 11, 11) |
+      __gen_field(dw_SurfaceObjectControlState, 0, 6) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->BindingTablePoolBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->BindingTablePoolBufferSize, 12, 31) |
+      0;
+
+}
+
+/* 3DSTATE_BLEND_STATE_POINTERS (subopcode 36): two-dword command.  DW1
+ * holds the blend-state pointer in bits 6..31 (64-byte-granular
+ * __gen_offset) plus a "pointer valid" commit bit at bit 0.
+ */
+#define GEN8_3DSTATE_BLEND_STATE_POINTERS_length_bias 0x00000002
+#define GEN8_3DSTATE_BLEND_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 36,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_BLEND_STATE_POINTERS_length 0x00000002
+
+struct GEN8_3DSTATE_BLEND_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BlendStatePointer;
+   bool                                         BlendStatePointerValid;
+};
+
+static inline void
+GEN8_3DSTATE_BLEND_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN8_3DSTATE_BLEND_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->BlendStatePointer, 6, 31) |
+      __gen_field(values->BlendStatePointerValid, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_CC_STATE_POINTERS (subopcode 14): two-dword command.  DW1
+ * holds the color-calc-state pointer in bits 6..31 (64-byte-granular
+ * __gen_offset) plus a "pointer valid" commit bit at bit 0.
+ */
+#define GEN8_3DSTATE_CC_STATE_POINTERS_length_bias 0x00000002
+#define GEN8_3DSTATE_CC_STATE_POINTERS_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 14,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_CC_STATE_POINTERS_length 0x00000002
+
+struct GEN8_3DSTATE_CC_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ColorCalcStatePointer;
+   bool                                         ColorCalcStatePointerValid;
+};
+
+static inline void
+GEN8_3DSTATE_CC_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN8_3DSTATE_CC_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ColorCalcStatePointer, 6, 31) |
+      __gen_field(values->ColorCalcStatePointerValid, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_CHROMA_KEY (opcode 1, subopcode 4): four-dword command.  DW1
+ * selects the chroma-key table entry (bits 30..31); DW2/DW3 carry the
+ * full 32-bit low and high key values.
+ */
+#define GEN8_3DSTATE_CHROMA_KEY_length_bias 0x00000002
+#define GEN8_3DSTATE_CHROMA_KEY_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_CHROMA_KEY_length 0x00000004
+
+struct GEN8_3DSTATE_CHROMA_KEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ChromaKeyTableIndex;
+   uint32_t                                     ChromaKeyLowValue;
+   uint32_t                                     ChromaKeyHighValue;
+};
+
+static inline void
+GEN8_3DSTATE_CHROMA_KEY_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN8_3DSTATE_CHROMA_KEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyTableIndex, 30, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChromaKeyLowValue, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ChromaKeyHighValue, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_CLEAR_PARAMS (subopcode 4): three-dword command.  DW1 is the
+ * depth clear value, stored as the raw IEEE-754 bit pattern of the
+ * float (__gen_float); DW2 bit 0 marks the value valid.
+ */
+#define GEN8_3DSTATE_CLEAR_PARAMS_length_bias 0x00000002
+#define GEN8_3DSTATE_CLEAR_PARAMS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  1
+
+#define GEN8_3DSTATE_CLEAR_PARAMS_length 0x00000003
+
+struct GEN8_3DSTATE_CLEAR_PARAMS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        DepthClearValue;
+   bool                                         DepthClearValueValid;
+};
+
+static inline void
+GEN8_3DSTATE_CLEAR_PARAMS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_CLEAR_PARAMS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_float(values->DepthClearValue) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DepthClearValueValid, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_CLIP (subopcode 18): four-dword clipper state command.
+ * The value macros interleaved with the struct fields give the legal
+ * encodings for the field that follows them; Normal/Force are #defined
+ * three times with identical replacement lists, which is a benign
+ * redefinition under C (identical token sequences).  Min/Max point
+ * widths are converted to fixed point by multiplying by (1 << 3),
+ * i.e. 3 fractional bits.
+ */
+#define GEN8_3DSTATE_CLIP_length_bias 0x00000002
+#define GEN8_3DSTATE_CLIP_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_CLIP_length 0x00000004
+
+struct GEN8_3DSTATE_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceUserClipDistanceCullTestEnableBitmask;
+#define     _8Bit                                              0
+#define     _4Bit                                              1
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   bool                                         EarlyCullEnable;
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceUserClipDistanceClipTestEnableBitmask;
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceClipMode;
+   bool                                         ClipperStatisticsEnable;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+   bool                                         ClipEnable;
+#define     API_OGL                                            0
+   uint32_t                                     APIMode;
+   bool                                         ViewportXYClipTestEnable;
+   bool                                         GuardbandClipTestEnable;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+#define     NORMAL                                             0
+#define     REJECT_ALL                                         3
+#define     ACCEPT_ALL                                         4
+   uint32_t                                     ClipMode;
+   bool                                         PerspectiveDivideDisable;
+   bool                                         NonPerspectiveBarycentricEnable;
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+   float                                        MinimumPointWidth;
+   float                                        MaximumPointWidth;
+   bool                                         ForceZeroRTAIndexEnable;
+   uint32_t                                     MaximumVPIndex;
+};
+
+static inline void
+GEN8_3DSTATE_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_3DSTATE_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ForceUserClipDistanceCullTestEnableBitmask, 20, 20) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 19, 19) |
+      __gen_field(values->EarlyCullEnable, 18, 18) |
+      __gen_field(values->ForceUserClipDistanceClipTestEnableBitmask, 17, 17) |
+      __gen_field(values->ForceClipMode, 16, 16) |
+      __gen_field(values->ClipperStatisticsEnable, 10, 10) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClipEnable, 31, 31) |
+      __gen_field(values->APIMode, 30, 30) |
+      __gen_field(values->ViewportXYClipTestEnable, 28, 28) |
+      __gen_field(values->GuardbandClipTestEnable, 26, 26) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 16, 23) |
+      __gen_field(values->ClipMode, 13, 15) |
+      __gen_field(values->PerspectiveDivideDisable, 9, 9) |
+      __gen_field(values->NonPerspectiveBarycentricEnable, 8, 8) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 4, 5) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 2, 3) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 0, 1) |
+      0;
+
+   /* Point widths are U?.3 fixed point: scaled by (1 << 3) before
+    * field insertion. */
+   dw[3] =
+      __gen_field(values->MinimumPointWidth * (1 << 3), 17, 27) |
+      __gen_field(values->MaximumPointWidth * (1 << 3), 6, 16) |
+      __gen_field(values->ForceZeroRTAIndexEnable, 5, 5) |
+      __gen_field(values->MaximumVPIndex, 0, 3) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_CONSTANT_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_CONSTANT_DS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_CONSTANT_DS_length 0x0000000b
+
+#define GEN8_3DSTATE_CONSTANT_BODY_length 0x0000000a
+
+/* Shared 10-dword payload of every 3DSTATE_CONSTANT_* command: four
+ * constant-buffer read lengths packed pairwise into DW0/DW1 (buffers
+ * 1/0 then 3/2 — note the swapped order within each dword), followed
+ * by four 64-bit constant-buffer addresses in DW2..DW9.
+ */
+struct GEN8_3DSTATE_CONSTANT_BODY {
+   uint32_t                                     ConstantBuffer1ReadLength;
+   uint32_t                                     ConstantBuffer0ReadLength;
+   uint32_t                                     ConstantBuffer3ReadLength;
+   uint32_t                                     ConstantBuffer2ReadLength;
+   __gen_address_type                           PointerToConstantBuffer0;
+   __gen_address_type                           PointerToConstantBuffer1;
+   __gen_address_type                           PointerToConstantBuffer2;
+   __gen_address_type                           PointerToConstantBuffer3;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_BODY_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_3DSTATE_CONSTANT_BODY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ConstantBuffer1ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer0ReadLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBuffer3ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer2ReadLength, 0, 15) |
+      0;
+
+   /* Each address below is emitted as a 64-bit relocation
+    * (__gen_combine_address) split into low/high dwords. */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->PointerToConstantBuffer0, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   uint32_t dw4 =
+      0;
+
+   uint64_t qw4 =
+      __gen_combine_address(data, &dw[4], values->PointerToConstantBuffer1, dw4);
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   uint32_t dw6 =
+      0;
+
+   uint64_t qw6 =
+      __gen_combine_address(data, &dw[6], values->PointerToConstantBuffer2, dw6);
+
+   dw[6] = qw6;
+   dw[7] = qw6 >> 32;
+
+   uint32_t dw8 =
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->PointerToConstantBuffer3, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+}
+
+/* 3DSTATE_CONSTANT_DS (subopcode 26): command header dword with the
+ * packed MOCS in bits 8..14, followed by the shared 10-dword
+ * GEN8_3DSTATE_CONSTANT_BODY starting at DW1. */
+struct GEN8_3DSTATE_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN8_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN8_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_CONSTANT_GS (subopcode 22): header dword with packed MOCS in
+ * bits 8..14, then the shared GEN8_3DSTATE_CONSTANT_BODY at DW1. */
+#define GEN8_3DSTATE_CONSTANT_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_CONSTANT_GS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_CONSTANT_GS_length 0x0000000b
+
+struct GEN8_3DSTATE_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN8_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN8_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_CONSTANT_HS (subopcode 25): header dword with packed MOCS in
+ * bits 8..14, then the shared GEN8_3DSTATE_CONSTANT_BODY at DW1. */
+#define GEN8_3DSTATE_CONSTANT_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_CONSTANT_HS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_CONSTANT_HS_length 0x0000000b
+
+struct GEN8_3DSTATE_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN8_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN8_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_CONSTANT_PS (subopcode 23): header dword with packed MOCS in
+ * bits 8..14, then the shared GEN8_3DSTATE_CONSTANT_BODY at DW1. */
+#define GEN8_3DSTATE_CONSTANT_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_CONSTANT_PS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 23,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_CONSTANT_PS_length 0x0000000b
+
+struct GEN8_3DSTATE_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN8_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN8_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* 3DSTATE_CONSTANT_VS (subopcode 21): header dword with packed MOCS in
+ * bits 8..14, then the shared GEN8_3DSTATE_CONSTANT_BODY at DW1. */
+#define GEN8_3DSTATE_CONSTANT_VS_length_bias 0x00000002
+#define GEN8_3DSTATE_CONSTANT_VS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_CONSTANT_VS_length 0x0000000b
+
+struct GEN8_3DSTATE_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN8_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+static inline void
+GEN8_3DSTATE_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN8_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+#define GEN8_3DSTATE_DEPTH_BUFFER_length_bias 0x00000002
+/* Default header: DwordLength = total length (8) - bias (2) = 6. */
+#define GEN8_3DSTATE_DEPTH_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  5,                  \
+   .DwordLength          =  6
+
+#define GEN8_3DSTATE_DEPTH_BUFFER_length 0x00000008
+
+/* 3DSTATE_DEPTH_BUFFER: depth buffer surface state (format, dimensions,
+ * base address, MOCS).  Enum-style #defines below give the encodings for
+ * SurfaceType and SurfaceFormat. */
+struct GEN8_3DSTATE_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         DepthWriteEnable;
+   bool                                         StencilWriteEnable;
+   bool                                         HierarchicalDepthBufferEnable;
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     SurfaceFormat;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     LOD;
+   uint32_t                                     Depth;
+   uint32_t                                     MinimumArrayElement;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      DepthBufferObjectControlState;
+   uint32_t                                     RenderTargetViewExtent;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Pack *values into dst as 8 dwords.  The surface base address is a
+ * relocation: it is combined via __gen_combine_address and written as a
+ * 64-bit value split across dw[2]/dw[3]. */
+static inline void
+GEN8_3DSTATE_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->DepthWriteEnable, 28, 28) |
+      __gen_field(values->StencilWriteEnable, 27, 27) |
+      __gen_field(values->HierarchicalDepthBufferEnable, 22, 22) |
+      __gen_field(values->SurfaceFormat, 18, 20) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   /* No non-address bits share these dwords; dw2 carries only the
+    * combined address. */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->Height, 18, 31) |
+      __gen_field(values->Width, 4, 17) |
+      __gen_field(values->LOD, 0, 3) |
+      0;
+
+   /* MOCS sub-structure goes into bits 0..6 of dw5. */
+   uint32_t dw_DepthBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DepthBufferObjectControlState, &values->DepthBufferObjectControlState);
+   dw[5] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->MinimumArrayElement, 10, 20) |
+      __gen_field(dw_DepthBufferObjectControlState, 0, 6) |
+      0;
+
+   /* dw6 has no fields in this layout; written as zero. */
+   dw[6] =
+      0;
+
+   dw[7] =
+      __gen_field(values->RenderTargetViewExtent, 21, 31) |
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_DRAWING_RECTANGLE_length_bias 0x00000002
+/* Default header: DwordLength = total length (4) - bias (2) = 2. */
+#define GEN8_3DSTATE_DRAWING_RECTANGLE_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_DRAWING_RECTANGLE_length 0x00000004
+
+/* 3DSTATE_DRAWING_RECTANGLE: clipped drawing rectangle (min/max X,Y
+ * packed as 16-bit halves of a dword) plus the rectangle origin. */
+struct GEN8_3DSTATE_DRAWING_RECTANGLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+#define     Legacy                                             0
+#define     Core0Enabled                                       1
+#define     Core1Enabled                                       2
+   uint32_t                                     CoreModeSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ClippedDrawingRectangleYMin;
+   uint32_t                                     ClippedDrawingRectangleXMin;
+   uint32_t                                     ClippedDrawingRectangleYMax;
+   uint32_t                                     ClippedDrawingRectangleXMax;
+   uint32_t                                     DrawingRectangleOriginY;
+   uint32_t                                     DrawingRectangleOriginX;
+};
+
+/* Pack *values into dst as 4 dwords (header + min + max + origin). */
+static inline void
+GEN8_3DSTATE_DRAWING_RECTANGLE_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN8_3DSTATE_DRAWING_RECTANGLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->CoreModeSelect, 14, 15) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ClippedDrawingRectangleYMin, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMin, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClippedDrawingRectangleYMax, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMax, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DrawingRectangleOriginY, 16, 31) |
+      __gen_field(values->DrawingRectangleOriginX, 0, 15) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_DS_length_bias 0x00000002
+/* Default header: DwordLength = total length (9) - bias (2) = 7. */
+#define GEN8_3DSTATE_DS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 29,                  \
+   .DwordLength          =  7
+
+#define GEN8_3DSTATE_DS_length 0x00000009
+
+/* 3DSTATE_DS: domain shader (DS) stage state — kernel start pointer,
+ * dispatch controls, scratch space, URB read/write layout, and enables.
+ * The interleaved #defines give the encodings for the field that follows
+ * them. */
+struct GEN8_3DSTATE_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     KernelStartPointer;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleDomainPointDispatch;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         AccessesUAV;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     PatchURBEntryReadLength;
+   uint32_t                                     PatchURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+   bool                                         SIMD8DispatchEnable;
+   bool                                         ComputeWCoordinateEnable;
+   bool                                         CacheDisable;
+   bool                                         FunctionEnable;
+   uint32_t                                     VertexURBEntryOutputReadOffset;
+   uint32_t                                     VertexURBEntryOutputLength;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+};
+
+/* Pack *values into dst as 9 dwords.  The two 64-bit pointer fields
+ * (kernel start, scratch base) are built as qwords and split across
+ * consecutive dword pairs. */
+static inline void
+GEN8_3DSTATE_DS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* 64-bit kernel start pointer (64-byte aligned: bits 6..63) in dw1/dw2. */
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->SingleDomainPointDispatch, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->AccessesUAV, 14, 14) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   /* 64-bit scratch base (1KB aligned: bits 10..63) plus the per-thread
+    * scratch size exponent in the low bits, split across dw4/dw5. */
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   dw[6] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
+      __gen_field(values->PatchURBEntryReadLength, 11, 17) |
+      __gen_field(values->PatchURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->MaximumNumberofThreads, 21, 29) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->SIMD8DispatchEnable, 3, 3) |
+      __gen_field(values->ComputeWCoordinateEnable, 2, 2) |
+      __gen_field(values->CacheDisable, 1, 1) |
+      __gen_field(values->FunctionEnable, 0, 0) |
+      0;
+
+   dw[8] =
+      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
+      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_GATHER_CONSTANT_DS_length_bias 0x00000002
+/* Variable-length command: the header macro sets no default DwordLength;
+ * callers must supply it based on how many entries they append. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_DS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 55
+
+/* _length 0 marks a variable-length command. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_DS_length 0x00000000
+
+#define GEN8_GATHER_CONSTANT_ENTRY_length 0x00000001
+
+/* One gather-constant entry: a single dword selecting a constant-buffer
+ * offset, channel mask, and binding-table index offset.  These entries
+ * form the variable-length tail of the 3DSTATE_GATHER_CONSTANT_*
+ * commands. */
+struct GEN8_GATHER_CONSTANT_ENTRY {
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ChannelMask;
+   uint32_t                                     BindingTableIndexOffset;
+};
+
+/* Pack one entry into a single dword at dst. */
+static inline void
+GEN8_GATHER_CONSTANT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_GATHER_CONSTANT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->ConstantBufferOffset, 8, 15) |
+      __gen_field(values->ChannelMask, 4, 7) |
+      __gen_field(values->BindingTableIndexOffset, 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_GATHER_CONSTANT_DS: gather-constant setup for the DS stage.
+ * Fixed part is 3 dwords; GATHER_CONSTANT_ENTRY dwords follow. */
+struct GEN8_3DSTATE_GATHER_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   /* variable length fields follow */
+};
+
+/* Pack the 3 fixed dwords of the command into dst; the caller is
+ * responsible for appending the variable-length entry dwords. */
+static inline void
+GEN8_3DSTATE_GATHER_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_GATHER_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_GATHER_CONSTANT_GS_length_bias 0x00000002
+/* Variable-length command: no default DwordLength in the header macro. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_GS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 53
+
+/* _length 0 marks a variable-length command. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_GS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_GS: gather-constant setup for the GS stage.
+ * Fixed part is 3 dwords; GATHER_CONSTANT_ENTRY dwords follow. */
+struct GEN8_3DSTATE_GATHER_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   /* variable length fields follow */
+};
+
+/* Pack the 3 fixed dwords of the command into dst; the caller appends
+ * the variable-length entry dwords. */
+static inline void
+GEN8_3DSTATE_GATHER_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_GATHER_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_GATHER_CONSTANT_HS_length_bias 0x00000002
+/* Variable-length command: no default DwordLength in the header macro. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_HS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 54
+
+/* _length 0 marks a variable-length command. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_HS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_HS: gather-constant setup for the HS stage.
+ * Fixed part is 3 dwords; GATHER_CONSTANT_ENTRY dwords follow. */
+struct GEN8_3DSTATE_GATHER_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   /* variable length fields follow */
+};
+
+/* Pack the 3 fixed dwords of the command into dst; the caller appends
+ * the variable-length entry dwords. */
+static inline void
+GEN8_3DSTATE_GATHER_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_GATHER_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_GATHER_CONSTANT_PS_length_bias 0x00000002
+/* Variable-length command: no default DwordLength in the header macro. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_PS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 56
+
+/* _length 0 marks a variable-length command. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_PS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_PS: gather-constant setup for the PS stage.
+ * Unlike the DS/GS/HS variants, this one also carries a
+ * ConstantBufferDx9Enable bit (bit 4 of dw2). */
+struct GEN8_3DSTATE_GATHER_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   bool                                         ConstantBufferDx9Enable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3 fixed dwords of the command into dst; the caller appends
+ * the variable-length entry dwords. */
+static inline void
+GEN8_3DSTATE_GATHER_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_GATHER_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_GATHER_CONSTANT_VS_length_bias 0x00000002
+/* Variable-length command: no default DwordLength in the header macro. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_VS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 52
+
+/* _length 0 marks a variable-length command. */
+#define GEN8_3DSTATE_GATHER_CONSTANT_VS_length 0x00000000
+
+/* 3DSTATE_GATHER_CONSTANT_VS: gather-constant setup for the VS stage.
+ * Like the PS variant, it carries a ConstantBufferDx9Enable bit
+ * (bit 4 of dw2). */
+struct GEN8_3DSTATE_GATHER_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   bool                                         ConstantBufferDx9Enable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3 fixed dwords of the command into dst; the caller appends
+ * the variable-length entry dwords. */
+static inline void
+GEN8_3DSTATE_GATHER_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_3DSTATE_GATHER_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_GATHER_POOL_ALLOC_length_bias 0x00000002
+/* Default header: DwordLength = total length (4) - bias (2) = 2. */
+#define GEN8_3DSTATE_GATHER_POOL_ALLOC_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_GATHER_POOL_ALLOC_length 0x00000004
+
+/* 3DSTATE_GATHER_POOL_ALLOC: gather pool base address, enable bit, MOCS,
+ * and pool buffer size. */
+struct GEN8_3DSTATE_GATHER_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GatherPoolBaseAddress;
+   bool                                         GatherPoolEnable;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   uint32_t                                     GatherPoolBufferSize;
+};
+
+/* Pack *values into dst as 4 dwords.  The enable bit and MOCS share the
+ * low bits of the 64-bit base address, which is combined via
+ * __gen_combine_address and split across dw[1]/dw[2]. */
+static inline void
+GEN8_3DSTATE_GATHER_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN8_3DSTATE_GATHER_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Non-address bits to OR into the address dword before relocation. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->GatherPoolEnable, 11, 11) |
+      __gen_field(dw_MemoryObjectControlState, 0, 6) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->GatherPoolBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->GatherPoolBufferSize, 12, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_GS_length_bias 0x00000002
+/* Default header: DwordLength = total length (10) - bias (2) = 8. */
+#define GEN8_3DSTATE_GS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  8
+
+#define GEN8_3DSTATE_GS_length 0x0000000a
+
+/* 3DSTATE_GS: geometry shader (GS) stage state — kernel start pointer,
+ * dispatch/thread controls, scratch space, URB layout, output topology,
+ * and enables.  The interleaved #defines give the encodings for the
+ * field that follows them. */
+struct GEN8_3DSTATE_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     KernelStartPointer;
+   uint32_t                                     SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         AccessesUAV;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ExpectedVertexCount;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     OutputVertexSize;
+   uint32_t                                     OutputTopology;
+   uint32_t                                     VertexURBEntryReadLength;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     ControlDataHeaderSize;
+   uint32_t                                     InstanceControl;
+   uint32_t                                     DefaultStreamId;
+#define     DispatchModeSingle                                 0
+#define     DispatchModeDualInstance                           1
+#define     DispatchModeDualObject                             2
+#define     DispatchModeSIMD8                                  3
+   uint32_t                                     DispatchMode;
+   bool                                         StatisticsEnable;
+   uint32_t                                     InvocationsIncrementValue;
+   bool                                         IncludePrimitiveID;
+   uint32_t                                     Hint;
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         DiscardAdjacency;
+   bool                                         Enable;
+#define     CUT                                                0
+#define     SID                                                1
+   uint32_t                                     ControlDataFormat;
+   bool                                         StaticOutput;
+   uint32_t                                     StaticOutputVertexCount;
+   uint32_t                                     VertexURBEntryOutputReadOffset;
+   uint32_t                                     VertexURBEntryOutputLength;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+};
+
+/* Pack a 3DSTATE_GS command into ten dwords at dst.  Each __gen_field() /
+ * __gen_offset() call places one struct member into the inclusive bit range
+ * named by its last two arguments; 64-bit members are assembled in a qword
+ * and then split across two consecutive dwords. */
+static inline void
+GEN8_3DSTATE_GS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header (type / opcode / length). */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DWords 1-2: 64-bit kernel start pointer; bits 0-5 are not encoded
+    * (64-byte-aligned offset). */
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* DWord 3: thread control and exception-enable flags. */
+   dw[3] =
+      __gen_field(values->SingleProgramFlow, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->AccessesUAV, 12, 12) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      __gen_field(values->ExpectedVertexCount, 0, 5) |
+      0;
+
+   /* DWords 4-5: scratch space base (bits 0-9 not encoded) plus the
+    * per-thread scratch size field in the low bits. */
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   /* DWord 6: vertex URB read layout and output sizing. */
+   dw[6] =
+      __gen_field(values->OutputVertexSize, 23, 28) |
+      __gen_field(values->OutputTopology, 17, 22) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->IncludeVertexHandles, 10, 10) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 0, 3) |
+      0;
+
+   /* DWord 7: dispatch/stream control, including the GS Enable bit. */
+   dw[7] =
+      __gen_field(values->MaximumNumberofThreads, 24, 31) |
+      __gen_field(values->ControlDataHeaderSize, 20, 23) |
+      __gen_field(values->InstanceControl, 15, 19) |
+      __gen_field(values->DefaultStreamId, 13, 14) |
+      __gen_field(values->DispatchMode, 11, 12) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->InvocationsIncrementValue, 5, 9) |
+      __gen_field(values->IncludePrimitiveID, 4, 4) |
+      __gen_field(values->Hint, 3, 3) |
+      __gen_field(values->ReorderMode, 2, 2) |
+      __gen_field(values->DiscardAdjacency, 1, 1) |
+      __gen_field(values->Enable, 0, 0) |
+      0;
+
+   /* DWord 8: static-output controls. */
+   dw[8] =
+      __gen_field(values->ControlDataFormat, 31, 31) |
+      __gen_field(values->StaticOutput, 30, 30) |
+      __gen_field(values->StaticOutputVertexCount, 16, 26) |
+      0;
+
+   /* DWord 9: output URB entry layout and user clip/cull test masks. */
+   dw[9] =
+      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
+      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_HIER_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN8_3DSTATE_HIER_DEPTH_BUFFER_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          =  3
+
+#define GEN8_3DSTATE_HIER_DEPTH_BUFFER_length 0x00000005
+
+/* CPU-side representation of the 3DSTATE_HIER_DEPTH_BUFFER command; packed
+ * into five dwords by GEN8_3DSTATE_HIER_DEPTH_BUFFER_pack(). */
+struct GEN8_3DSTATE_HIER_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* Nested MOCS state; packed separately and merged into DWord 1. */
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      HierarchicalDepthBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   /* Driver-defined address handle, resolved via __gen_combine_address(). */
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Pack a 3DSTATE_HIER_DEPTH_BUFFER command into five dwords at dst. */
+static inline void
+GEN8_3DSTATE_HIER_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN8_3DSTATE_HIER_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Pack the nested memory-object control state into a scratch dword,
+    * then merge it into DWord 1 alongside the surface pitch. */
+   uint32_t dw_HierarchicalDepthBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_HierarchicalDepthBufferObjectControlState, &values->HierarchicalDepthBufferObjectControlState);
+   dw[1] =
+      __gen_field(dw_HierarchicalDepthBufferObjectControlState, 25, 31) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   /* DWords 2-3: 64-bit surface base address.  __gen_combine_address()
+    * merges dw2's literal bits with the address handle -- presumably also
+    * recording a relocation through `data`; depends on the driver's
+    * definition of that hook. */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   /* DWord 4: surface QPitch. */
+   dw[4] =
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_HS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 27,                  \
+   .DwordLength          =  7
+
+#define GEN8_3DSTATE_HS_length 0x00000009
+
+/* CPU-side representation of the 3DSTATE_HS (hull shader) command; packed
+ * into nine dwords by GEN8_3DSTATE_HS_pack(). */
+struct GEN8_3DSTATE_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* Allowed values for SamplerCount: */
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+   /* Allowed values for ThreadDispatchPriority: */
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+   /* Allowed values for FloatingPointMode: */
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   bool                                         Enable;
+   bool                                         StatisticsEnable;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     InstanceCount;
+   /* 64-byte-aligned offset; low 6 bits are dropped when packed. */
+   uint64_t                                     KernelStartPointer;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   bool                                         SingleProgramFlow;
+   /* Allowed values for VectorMaskEnable: */
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+   bool                                         AccessesUAV;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+};
+
+/* Pack a 3DSTATE_HS command into nine dwords at dst. */
+static inline void
+GEN8_3DSTATE_HS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DWord 1: sampler/binding-table sizing and exception flags. */
+   dw[1] =
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 12, 12) |
+      0;
+
+   /* DWord 2: enable/statistics bits, thread count, instance count. */
+   dw[2] =
+      __gen_field(values->Enable, 31, 31) |
+      __gen_field(values->StatisticsEnable, 29, 29) |
+      __gen_field(values->MaximumNumberofThreads, 8, 16) |
+      __gen_field(values->InstanceCount, 0, 3) |
+      0;
+
+   /* DWords 3-4: 64-bit kernel start pointer (bits 0-5 not encoded). */
+   uint64_t qw3 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[3] = qw3;
+   dw[4] = qw3 >> 32;
+
+   /* DWords 5-6: scratch base (bits 0-9 not encoded) + per-thread size. */
+   uint64_t qw5 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[5] = qw5;
+   dw[6] = qw5 >> 32;
+
+   /* DWord 7: thread flags and vertex URB read layout. */
+   dw[7] =
+      __gen_field(values->SingleProgramFlow, 27, 27) |
+      __gen_field(values->VectorMaskEnable, 26, 26) |
+      __gen_field(values->AccessesUAV, 25, 25) |
+      __gen_field(values->IncludeVertexHandles, 24, 24) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 19, 23) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      0;
+
+   /* DWord 8: no fields in this layout; always written as zero. */
+   dw[8] =
+      0;
+
+}
+
+#define GEN8_3DSTATE_INDEX_BUFFER_length_bias 0x00000002
+#define GEN8_3DSTATE_INDEX_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  3
+
+#define GEN8_3DSTATE_INDEX_BUFFER_length 0x00000005
+
+/* CPU-side representation of the 3DSTATE_INDEX_BUFFER command; packed into
+ * five dwords by GEN8_3DSTATE_INDEX_BUFFER_pack(). */
+struct GEN8_3DSTATE_INDEX_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* Allowed values for IndexFormat (index element width): */
+#define     INDEX_BYTE                                         0
+#define     INDEX_WORD                                         1
+#define     INDEX_DWORD                                        2
+   uint32_t                                     IndexFormat;
+   /* Nested MOCS state; packed separately and merged into DWord 1. */
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   /* Driver-defined address handle, resolved via __gen_combine_address(). */
+   __gen_address_type                           BufferStartingAddress;
+   uint32_t                                     BufferSize;
+};
+
+/* Pack a 3DSTATE_INDEX_BUFFER command into five dwords at dst. */
+static inline void
+GEN8_3DSTATE_INDEX_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_INDEX_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DWord 1: index format plus the packed memory-object control state. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[1] =
+      __gen_field(values->IndexFormat, 8, 9) |
+      __gen_field(dw_MemoryObjectControlState, 0, 6) |
+      0;
+
+   /* DWords 2-3: 64-bit buffer start address, combined with dw2's literal
+    * bits by the driver's __gen_combine_address() hook (presumably also
+    * emitting a relocation through `data` -- hook-defined). */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->BufferStartingAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   /* DWord 4: buffer size, full 32 bits. */
+   dw[4] =
+      __gen_field(values->BufferSize, 0, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_LINE_STIPPLE_length_bias 0x00000002
+#define GEN8_3DSTATE_LINE_STIPPLE_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  8,                  \
+   .DwordLength          =  1
+
+#define GEN8_3DSTATE_LINE_STIPPLE_length 0x00000003
+
+/* CPU-side representation of the 3DSTATE_LINE_STIPPLE command; packed into
+ * three dwords by GEN8_3DSTATE_LINE_STIPPLE_pack(). */
+struct GEN8_3DSTATE_LINE_STIPPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ModifyEnableCurrentRepeatCounterCurrentStippleIndex;
+   uint32_t                                     CurrentRepeatCounter;
+   uint32_t                                     CurrentStippleIndex;
+   uint32_t                                     LineStipplePattern;
+   /* Float; the pack function scales it by 2^16 into a fixed-point field. */
+   float                                        LineStippleInverseRepeatCount;
+   uint32_t                                     LineStippleRepeatCount;
+};
+
+/* Pack a 3DSTATE_LINE_STIPPLE command into three dwords at dst. */
+static inline void
+GEN8_3DSTATE_LINE_STIPPLE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_LINE_STIPPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DWord 1: current stipple state and the 16-bit pattern. */
+   dw[1] =
+      __gen_field(values->ModifyEnableCurrentRepeatCounterCurrentStippleIndex, 31, 31) |
+      __gen_field(values->CurrentRepeatCounter, 21, 29) |
+      __gen_field(values->CurrentStippleIndex, 16, 19) |
+      __gen_field(values->LineStipplePattern, 0, 15) |
+      0;
+
+   /* DWord 2: inverse repeat count scaled by 2^16 into bits 15-31
+    * (presumably a 1.16 fixed-point encoding -- confirm against the PRM),
+    * plus the integer repeat count in the low bits. */
+   dw[2] =
+      __gen_field(values->LineStippleInverseRepeatCount * (1 << 16), 15, 31) |
+      __gen_field(values->LineStippleRepeatCount, 0, 8) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_MONOFILTER_SIZE_length_bias 0x00000002
+#define GEN8_3DSTATE_MONOFILTER_SIZE_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_MONOFILTER_SIZE_length 0x00000002
+
+/* CPU-side representation of the 3DSTATE_MONOFILTER_SIZE command; packed
+ * into two dwords by GEN8_3DSTATE_MONOFILTER_SIZE_pack(). */
+struct GEN8_3DSTATE_MONOFILTER_SIZE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     MonochromeFilterWidth;
+   uint32_t                                     MonochromeFilterHeight;
+};
+
+/* Pack a 3DSTATE_MONOFILTER_SIZE command into two dwords at dst. */
+static inline void
+GEN8_3DSTATE_MONOFILTER_SIZE_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN8_3DSTATE_MONOFILTER_SIZE * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWord 1: monochrome filter dimensions. */
+   out[1] = __gen_field(values->MonochromeFilterWidth, 3, 5) |
+            __gen_field(values->MonochromeFilterHeight, 0, 2);
+}
+
+#define GEN8_3DSTATE_MULTISAMPLE_length_bias 0x00000002
+#define GEN8_3DSTATE_MULTISAMPLE_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 13,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_MULTISAMPLE_length 0x00000002
+
+/* CPU-side representation of the 3DSTATE_MULTISAMPLE command; packed into
+ * two dwords by GEN8_3DSTATE_MULTISAMPLE_pack(). */
+struct GEN8_3DSTATE_MULTISAMPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PixelPositionOffsetEnable;
+   /* Allowed values for PixelLocation: */
+#define     CENTER                                             0
+#define     UL_CORNER                                          1
+   uint32_t                                     PixelLocation;
+   uint32_t                                     NumberofMultisamples;
+};
+
+/* Pack a 3DSTATE_MULTISAMPLE command into two dwords at dst. */
+static inline void
+GEN8_3DSTATE_MULTISAMPLE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_MULTISAMPLE * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWord 1: multisample configuration. */
+   out[1] = __gen_field(values->PixelPositionOffsetEnable, 5, 5) |
+            __gen_field(values->PixelLocation, 4, 4) |
+            __gen_field(values->NumberofMultisamples, 1, 3);
+}
+
+#define GEN8_3DSTATE_POLY_STIPPLE_OFFSET_length_bias 0x00000002
+#define GEN8_3DSTATE_POLY_STIPPLE_OFFSET_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_POLY_STIPPLE_OFFSET_length 0x00000002
+
+/* CPU-side representation of the 3DSTATE_POLY_STIPPLE_OFFSET command;
+ * packed into two dwords by GEN8_3DSTATE_POLY_STIPPLE_OFFSET_pack(). */
+struct GEN8_3DSTATE_POLY_STIPPLE_OFFSET {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PolygonStippleXOffset;
+   uint32_t                                     PolygonStippleYOffset;
+};
+
+/* Pack a 3DSTATE_POLY_STIPPLE_OFFSET command into two dwords at dst. */
+static inline void
+GEN8_3DSTATE_POLY_STIPPLE_OFFSET_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN8_3DSTATE_POLY_STIPPLE_OFFSET * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWord 1: stipple pattern X/Y offsets. */
+   out[1] = __gen_field(values->PolygonStippleXOffset, 8, 12) |
+            __gen_field(values->PolygonStippleYOffset, 0, 4);
+}
+
+#define GEN8_3DSTATE_POLY_STIPPLE_PATTERN_length_bias 0x00000002
+#define GEN8_3DSTATE_POLY_STIPPLE_PATTERN_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          = 31
+
+#define GEN8_3DSTATE_POLY_STIPPLE_PATTERN_length 0x00000021
+
+/* CPU-side representation of the 3DSTATE_POLY_STIPPLE_PATTERN command;
+ * packed into 33 dwords by GEN8_3DSTATE_POLY_STIPPLE_PATTERN_pack(). */
+struct GEN8_3DSTATE_POLY_STIPPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* 32 stipple rows; one full dword per row. */
+   uint32_t                                     PatternRow[32];
+};
+
+/* Pack a 3DSTATE_POLY_STIPPLE_PATTERN command (header + 32 pattern rows)
+ * into 33 dwords at dst. */
+static inline void
+GEN8_3DSTATE_POLY_STIPPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN8_3DSTATE_POLY_STIPPLE_PATTERN * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWords 1-32: one stipple row per dword, full 32 bits each. */
+   for (uint32_t row = 0; row < 32; row++) {
+      out[row + 1] = __gen_field(values->PatternRow[row], 0, 31);
+   }
+}
+
+#define GEN8_3DSTATE_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_PS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 32,                  \
+   .DwordLength          = 10
+
+#define GEN8_3DSTATE_PS_length 0x0000000c
+
+/* CPU-side representation of the 3DSTATE_PS (pixel shader) command; packed
+ * into twelve dwords by GEN8_3DSTATE_PS_pack(). */
+struct GEN8_3DSTATE_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* 64-byte-aligned offset; low 6 bits are dropped when packed. */
+   uint64_t                                     KernelStartPointer0;
+   /* Allowed values for SingleProgramFlow: */
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+   /* Allowed values for VectorMaskEnable: */
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+   /* Allowed values for SamplerCount: */
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   /* Allowed values for SinglePrecisionDenormalMode: */
+#define     FlushedtoZero                                      0
+#define     Retained                                           1
+   uint32_t                                     SinglePrecisionDenormalMode;
+   uint32_t                                     BindingTableEntryCount;
+   /* Allowed values for ThreadDispatchPriority: */
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+   /* Allowed values for FloatingPointMode: */
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   /* Allowed values for RoundingMode: */
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreadsPerPSD;
+   bool                                         PushConstantEnable;
+   bool                                         RenderTargetFastClearEnable;
+   bool                                         RenderTargetResolveEnable;
+   /* Allowed values for PositionXYOffsetSelect: */
+#define     POSOFFSET_NONE                                     0
+#define     POSOFFSET_CENTROID                                 2
+#define     POSOFFSET_SAMPLE                                   3
+   uint32_t                                     PositionXYOffsetSelect;
+   bool                                         _32PixelDispatchEnable;
+   bool                                         _16PixelDispatchEnable;
+   bool                                         _8PixelDispatchEnable;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData0;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData1;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData2;
+   /* Additional kernel start pointers (64-byte aligned, like pointer 0). */
+   uint64_t                                     KernelStartPointer1;
+   uint64_t                                     KernelStartPointer2;
+};
+
+/* Pack a 3DSTATE_PS command into twelve dwords at dst.  Three 64-bit kernel
+ * start pointers each occupy a dword pair. */
+static inline void
+GEN8_3DSTATE_PS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DWords 1-2: kernel start pointer 0 (bits 0-5 not encoded). */
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer0, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* DWord 3: thread control, float mode, and exception flags. */
+   dw[3] =
+      __gen_field(values->SingleProgramFlow, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->SinglePrecisionDenormalMode, 26, 26) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->RoundingMode, 14, 15) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   /* DWords 4-5: scratch base (bits 0-9 not encoded) + per-thread size. */
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   /* DWord 6: PSD thread limit, render-target controls, dispatch widths. */
+   dw[6] =
+      __gen_field(values->MaximumNumberofThreadsPerPSD, 23, 31) |
+      __gen_field(values->PushConstantEnable, 11, 11) |
+      __gen_field(values->RenderTargetFastClearEnable, 8, 8) |
+      __gen_field(values->RenderTargetResolveEnable, 6, 6) |
+      __gen_field(values->PositionXYOffsetSelect, 3, 4) |
+      __gen_field(values->_32PixelDispatchEnable, 2, 2) |
+      __gen_field(values->_16PixelDispatchEnable, 1, 1) |
+      __gen_field(values->_8PixelDispatchEnable, 0, 0) |
+      0;
+
+   /* DWord 7: GRF start registers for the three constant setup blocks. */
+   dw[7] =
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData0, 16, 22) |
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData1, 8, 14) |
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData2, 0, 6) |
+      0;
+
+   /* DWords 8-9: kernel start pointer 1. */
+   uint64_t qw8 =
+      __gen_offset(values->KernelStartPointer1, 6, 63) |
+      0;
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   /* DWords 10-11: kernel start pointer 2. */
+   uint64_t qw10 =
+      __gen_offset(values->KernelStartPointer2, 6, 63) |
+      0;
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+}
+
+#define GEN8_3DSTATE_PS_BLEND_length_bias 0x00000002
+#define GEN8_3DSTATE_PS_BLEND_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 77,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PS_BLEND_length 0x00000002
+
+/* CPU-side representation of the 3DSTATE_PS_BLEND command; packed into two
+ * dwords by GEN8_3DSTATE_PS_BLEND_pack(). */
+struct GEN8_3DSTATE_PS_BLEND {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         AlphaToCoverageEnable;
+   bool                                         HasWriteableRT;
+   bool                                         ColorBufferBlendEnable;
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   bool                                         AlphaTestEnable;
+   bool                                         IndependentAlphaBlendEnable;
+};
+
+/* Pack a 3DSTATE_PS_BLEND command into two dwords at dst. */
+static inline void
+GEN8_3DSTATE_PS_BLEND_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_3DSTATE_PS_BLEND * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWord 1: blend enables and the four blend factors. */
+   uint32_t blend = 0;
+   blend |= __gen_field(values->AlphaToCoverageEnable, 31, 31);
+   blend |= __gen_field(values->HasWriteableRT, 30, 30);
+   blend |= __gen_field(values->ColorBufferBlendEnable, 29, 29);
+   blend |= __gen_field(values->SourceAlphaBlendFactor, 24, 28);
+   blend |= __gen_field(values->DestinationAlphaBlendFactor, 19, 23);
+   blend |= __gen_field(values->SourceBlendFactor, 14, 18);
+   blend |= __gen_field(values->DestinationBlendFactor, 9, 13);
+   blend |= __gen_field(values->AlphaTestEnable, 8, 8);
+   blend |= __gen_field(values->IndependentAlphaBlendEnable, 7, 7);
+   out[1] = blend;
+}
+
+#define GEN8_3DSTATE_PS_EXTRA_length_bias 0x00000002
+#define GEN8_3DSTATE_PS_EXTRA_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 79,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PS_EXTRA_length 0x00000002
+
+/* CPU-side representation of the 3DSTATE_PS_EXTRA command; packed into two
+ * dwords by GEN8_3DSTATE_PS_EXTRA_pack(). */
+struct GEN8_3DSTATE_PS_EXTRA {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         PixelShaderValid;
+   bool                                         PixelShaderDoesnotwritetoRT;
+   bool                                         oMaskPresenttoRenderTarget;
+   bool                                         PixelShaderKillsPixel;
+   /* Allowed values for PixelShaderComputedDepthMode: */
+#define     PSCDEPTH_OFF                                       0
+#define     PSCDEPTH_ON                                        1
+#define     PSCDEPTH_ON_GE                                     2
+#define     PSCDEPTH_ON_LE                                     3
+   uint32_t                                     PixelShaderComputedDepthMode;
+   bool                                         ForceComputedDepth;
+   bool                                         PixelShaderUsesSourceDepth;
+   bool                                         PixelShaderUsesSourceW;
+   uint32_t                                     Removed;
+   bool                                         AttributeEnable;
+   bool                                         PixelShaderDisablesAlphaToCoverage;
+   bool                                         PixelShaderIsPerSample;
+   bool                                         PixelShaderHasUAV;
+   bool                                         PixelShaderUsesInputCoverageMask;
+};
+
+/* Pack a 3DSTATE_PS_EXTRA command into two dwords at dst. */
+static inline void
+GEN8_3DSTATE_PS_EXTRA_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_3DSTATE_PS_EXTRA * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* DWord 0: command header, accumulated field by field. */
+   uint32_t header = 0;
+   header |= __gen_field(values->CommandType, 29, 31);
+   header |= __gen_field(values->CommandSubType, 27, 28);
+   header |= __gen_field(values->_3DCommandOpcode, 24, 26);
+   header |= __gen_field(values->_3DCommandSubOpcode, 16, 23);
+   header |= __gen_field(values->DwordLength, 0, 7);
+   out[0] = header;
+
+   /* DWord 1: pixel-shader capability and behavior flags. */
+   uint32_t extra = 0;
+   extra |= __gen_field(values->PixelShaderValid, 31, 31);
+   extra |= __gen_field(values->PixelShaderDoesnotwritetoRT, 30, 30);
+   extra |= __gen_field(values->oMaskPresenttoRenderTarget, 29, 29);
+   extra |= __gen_field(values->PixelShaderKillsPixel, 28, 28);
+   extra |= __gen_field(values->PixelShaderComputedDepthMode, 26, 27);
+   extra |= __gen_field(values->ForceComputedDepth, 25, 25);
+   extra |= __gen_field(values->PixelShaderUsesSourceDepth, 24, 24);
+   extra |= __gen_field(values->PixelShaderUsesSourceW, 23, 23);
+   extra |= __gen_field(values->Removed, 17, 17);
+   extra |= __gen_field(values->AttributeEnable, 8, 8);
+   extra |= __gen_field(values->PixelShaderDisablesAlphaToCoverage, 7, 7);
+   extra |= __gen_field(values->PixelShaderIsPerSample, 6, 6);
+   extra |= __gen_field(values->PixelShaderHasUAV, 2, 2);
+   extra |= __gen_field(values->PixelShaderUsesInputCoverageMask, 1, 1);
+   out[1] = extra;
+}
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length 0x00000002
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_DS (GEN8): push-constant buffer allocation for
+ * the DS stage; offset and size are packed into dword 1 below.
+ */
+struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused (uniform
+ * pack-function signature).
+ */
+static inline void
+GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length 0x00000002
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_GS (GEN8): push-constant buffer allocation for
+ * the GS stage; same field layout as the other PUSH_CONSTANT_ALLOC commands,
+ * distinguished only by sub-opcode 21.
+ */
+struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused (uniform
+ * pack-function signature).
+ */
+static inline void
+GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length 0x00000002
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_HS (GEN8): push-constant buffer allocation for
+ * the HS stage; same field layout as the other PUSH_CONSTANT_ALLOC commands,
+ * distinguished only by sub-opcode 19.
+ */
+struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused (uniform
+ * pack-function signature).
+ */
+static inline void
+GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length 0x00000002
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_PS (GEN8): push-constant buffer allocation for
+ * the PS stage; same field layout as the other PUSH_CONSTANT_ALLOC commands,
+ * distinguished only by sub-opcode 22.
+ */
+struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused (uniform
+ * pack-function signature).
+ */
+static inline void
+GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length_bias 0x00000002
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length 0x00000002
+
+/* 3DSTATE_PUSH_CONSTANT_ALLOC_VS (GEN8): push-constant buffer allocation for
+ * the VS stage; same field layout as the other PUSH_CONSTANT_ALLOC commands,
+ * distinguished only by sub-opcode 18.
+ */
+struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused (uniform
+ * pack-function signature).
+ */
+static inline void
+GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_RASTER_length_bias 0x00000002
+#define GEN8_3DSTATE_RASTER_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 80,                  \
+   .DwordLength          =  3
+
+#define GEN8_3DSTATE_RASTER_length 0x00000005
+
+/* 3DSTATE_RASTER (GEN8): five-dword rasterizer state command.  The #defines
+ * interleaved below name the legal values for the field that follows them.
+ */
+struct GEN8_3DSTATE_RASTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     DX9OGL                                             0
+#define     DX100                                              1
+#define     DX101                                              2
+   uint32_t                                     APIMode;
+#define     Clockwise                                          0
+#define     CounterClockwise                                   1
+   uint32_t                                     FrontWinding;
+#define     FSC_NUMRASTSAMPLES_0                               0
+#define     FSC_NUMRASTSAMPLES_1                               1
+#define     FSC_NUMRASTSAMPLES_2                               2
+#define     FSC_NUMRASTSAMPLES_4                               3
+#define     FSC_NUMRASTSAMPLES_8                               4
+#define     FSC_NUMRASTSAMPLES_16                              5
+   uint32_t                                     ForcedSampleCount;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+#define     Normal                                             0
+#define     Force                                              1
+   uint32_t                                     ForceMultisampling;
+   bool                                         SmoothPointEnable;
+   bool                                         DXMultisampleRasterizationEnable;
+#define     MSRASTMODE_OFF_PIXEL                               0
+#define     MSRASTMODE_OFF_PATTERN                             1
+#define     MSRASTMODE_ON_PIXEL                                2
+#define     MSRASTMODE_ON_PATTERN                              3
+   uint32_t                                     DXMultisampleRasterizationMode;
+   bool                                         GlobalDepthOffsetEnableSolid;
+   bool                                         GlobalDepthOffsetEnableWireframe;
+   bool                                         GlobalDepthOffsetEnablePoint;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     FrontFaceFillMode;
+/* RASTER_* are re-#defined with identical values for the back-face field;
+ * identical macro redefinition is legal C and keeps the generator simple.
+ */
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     BackFaceFillMode;
+   bool                                         AntialiasingEnable;
+   bool                                         ScissorRectangleEnable;
+   bool                                         ViewportZClipTestEnable;
+   float                                        GlobalDepthOffsetConstant;
+   float                                        GlobalDepthOffsetScale;
+   float                                        GlobalDepthOffsetClamp;
+};
+
+/* Encode *values into the 5 dwords at dst.  Dwords 2-4 carry the depth-offset
+ * floats as raw IEEE bit patterns via __gen_float.  `data` is unused.
+ */
+static inline void
+GEN8_3DSTATE_RASTER_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_3DSTATE_RASTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1: all raster enables/modes; bit ranges are (low, high) inclusive. */
+   dw[1] =
+      __gen_field(values->APIMode, 22, 23) |
+      __gen_field(values->FrontWinding, 21, 21) |
+      __gen_field(values->ForcedSampleCount, 18, 20) |
+      __gen_field(values->CullMode, 16, 17) |
+      __gen_field(values->ForceMultisampling, 14, 14) |
+      __gen_field(values->SmoothPointEnable, 13, 13) |
+      __gen_field(values->DXMultisampleRasterizationEnable, 12, 12) |
+      __gen_field(values->DXMultisampleRasterizationMode, 10, 11) |
+      __gen_field(values->GlobalDepthOffsetEnableSolid, 9, 9) |
+      __gen_field(values->GlobalDepthOffsetEnableWireframe, 8, 8) |
+      __gen_field(values->GlobalDepthOffsetEnablePoint, 7, 7) |
+      __gen_field(values->FrontFaceFillMode, 5, 6) |
+      __gen_field(values->BackFaceFillMode, 3, 4) |
+      __gen_field(values->AntialiasingEnable, 2, 2) |
+      __gen_field(values->ScissorRectangleEnable, 1, 1) |
+      __gen_field(values->ViewportZClipTestEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->GlobalDepthOffsetConstant) |
+      0;
+
+   dw[3] =
+      __gen_float(values->GlobalDepthOffsetScale) |
+      0;
+
+   dw[4] =
+      __gen_float(values->GlobalDepthOffsetClamp) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2
+
+/* _length 0 marks a variable-length command: the caller appends
+ * GEN8_PALETTE_ENTRY dwords after the header and sets DwordLength itself.
+ */
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0_length 0x00000000
+
+#define GEN8_PALETTE_ENTRY_length 0x00000001
+
+/* One ARGB palette entry, 8 bits per channel, packed into a single dword. */
+struct GEN8_PALETTE_ENTRY {
+   uint32_t                                     Alpha;
+   uint32_t                                     Red;
+   uint32_t                                     Green;
+   uint32_t                                     Blue;
+};
+
+/* Encode *values into the single dword at dst (A|R|G|B, high to low byte). */
+static inline void
+GEN8_PALETTE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_PALETTE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->Alpha, 24, 31) |
+      __gen_field(values->Red, 16, 23) |
+      __gen_field(values->Green, 8, 15) |
+      __gen_field(values->Blue, 0, 7) |
+      0;
+
+}
+
+struct GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword; palette entries must be packed separately by
+ * the caller immediately after it.
+ */
+static inline void
+GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_SAMPLER_PALETTE_LOAD0 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 12
+
+/* Variable-length (length macro 0): caller appends palette entries and sets
+ * DwordLength, as with SAMPLER_PALETTE_LOAD0.
+ */
+#define GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1_length 0x00000000
+
+struct GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword; variable-length payload is the caller's job. */
+static inline void
+GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN8_3DSTATE_SAMPLER_PALETTE_LOAD1 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 45,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS_length 0x00000002
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_DS (GEN8): sets the DS sampler-state
+ * pointer.  The pointer is packed with __gen_offset into bits 5..31, i.e. it
+ * must be 32-byte aligned.
+ */
+struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSSamplerState;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 46,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS_length 0x00000002
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_GS (GEN8): same layout as the DS variant,
+ * sub-opcode 46, GS sampler-state pointer in dword 1 bits 5..31.
+ */
+struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSSamplerState;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 44,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS_length 0x00000002
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_HS (GEN8): same layout as the DS variant,
+ * sub-opcode 44, HS sampler-state pointer in dword 1 bits 5..31.
+ */
+struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSSamplerState;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 47,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS_length 0x00000002
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_PS (GEN8): same layout as the DS variant,
+ * sub-opcode 47, PS sampler-state pointer in dword 1 bits 5..31.
+ */
+struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSSamplerState;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 43,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS_length 0x00000002
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_VS (GEN8): same layout as the DS variant,
+ * sub-opcode 43, VS sampler-state pointer in dword 1 bits 5..31.
+ */
+struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSSamplerState;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLE_MASK_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLE_MASK_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SAMPLE_MASK_length 0x00000002
+
+/* 3DSTATE_SAMPLE_MASK (GEN8): 16-bit coverage sample mask, packed into the
+ * low 16 bits of dword 1.
+ */
+struct GEN8_3DSTATE_SAMPLE_MASK {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SampleMask;
+};
+
+/* Encode *values into the 2 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLE_MASK_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_SAMPLE_MASK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SampleMask, 0, 15) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_SAMPLE_PATTERN_length_bias 0x00000002
+#define GEN8_3DSTATE_SAMPLE_PATTERN_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  7
+
+#define GEN8_3DSTATE_SAMPLE_PATTERN_length 0x00000009
+
+/* 3DSTATE_SAMPLE_PATTERN (GEN8): programmable MSAA sample positions for the
+ * 1x/2x/4x/8x patterns.  Offsets are floats that the pack function converts
+ * to 4-bit fixed point (multiplied by 1 << 4), so each offset occupies one
+ * nibble of its dword.
+ */
+struct GEN8_3DSTATE_SAMPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        _8xSample7XOffset;
+   float                                        _8xSample7YOffset;
+   float                                        _8xSample6XOffset;
+   float                                        _8xSample6YOffset;
+   float                                        _8xSample5XOffset;
+   float                                        _8xSample5YOffset;
+   float                                        _8xSample4XOffset;
+   float                                        _8xSample4YOffset;
+   float                                        _8xSample3XOffset;
+   float                                        _8xSample3YOffset;
+   float                                        _8xSample2XOffset;
+   float                                        _8xSample2YOffset;
+   float                                        _8xSample1XOffset;
+   float                                        _8xSample1YOffset;
+   float                                        _8xSample0XOffset;
+   float                                        _8xSample0YOffset;
+   float                                        _4xSample3XOffset;
+   float                                        _4xSample3YOffset;
+   float                                        _4xSample2XOffset;
+   float                                        _4xSample2YOffset;
+   float                                        _4xSample1XOffset;
+   float                                        _4xSample1YOffset;
+   float                                        _4xSample0XOffset;
+   float                                        _4xSample0YOffset;
+   float                                        _1xSample0XOffset;
+   float                                        _1xSample0YOffset;
+   float                                        _2xSample1XOffset;
+   float                                        _2xSample1YOffset;
+   float                                        _2xSample0XOffset;
+   float                                        _2xSample0YOffset;
+};
+
+/* Encode *values into the 9 dwords at dst; `data` is unused. */
+static inline void
+GEN8_3DSTATE_SAMPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN8_3DSTATE_SAMPLE_PATTERN * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* dw0: command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1-dw4 carry no fields in this layout and are written as zero. */
+   for (uint32_t i = 0, j = 1; i < 4; i += 1, j++) {
+      dw[j] =
+         0;
+   }
+
+   /* dw5-dw6: 8x pattern (samples 7..0, one X/Y nibble pair per sample). */
+   dw[5] =
+      __gen_field(values->_8xSample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_8xSample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_8xSample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_8xSample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_8xSample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_8xSample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_8xSample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_8xSample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[6] =
+      __gen_field(values->_8xSample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_8xSample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_8xSample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_8xSample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_8xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_8xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_8xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_8xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   /* dw7: 4x pattern (samples 3..0). */
+   dw[7] =
+      __gen_field(values->_4xSample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_4xSample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_4xSample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_4xSample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_4xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_4xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_4xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_4xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   /* dw8: 1x sample plus both 2x samples. */
+   dw[8] =
+      __gen_field(values->_1xSample0XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_1xSample0YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_2xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_2xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_2xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_2xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+}
+
+/* 3DSTATE_SBE (Setup Backend): header initializer, fixed length (4 dwords),
+ * state struct, and pack helper. Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_SBE_length_bias 0x00000002
+#define GEN8_3DSTATE_SBE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 31,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_SBE_length 0x00000004
+
+struct GEN8_3DSTATE_SBE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ForceVertexURBEntryReadLength;
+   bool                                         ForceVertexURBEntryReadOffset;
+   uint32_t                                     NumberofSFOutputAttributes;
+   bool                                         AttributeSwizzleEnable;
+/* Values for PointSpriteTextureCoordinateOrigin: */
+#define     UPPERLEFT                                          0
+#define     LOWERLEFT                                          1
+   uint32_t                                     PointSpriteTextureCoordinateOrigin;
+   bool                                         PrimitiveIDOverrideComponentW;
+   bool                                         PrimitiveIDOverrideComponentZ;
+   bool                                         PrimitiveIDOverrideComponentY;
+   bool                                         PrimitiveIDOverrideComponentX;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     PrimitiveIDOverrideAttributeSelect;
+   uint32_t                                     PointSpriteTextureCoordinateEnable;
+   uint32_t                                     ConstantInterpolationEnable;
+};
+
+/* Pack a GEN8_3DSTATE_SBE command into 4 dwords at `dst`. */
+static inline void
+GEN8_3DSTATE_SBE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN8_3DSTATE_SBE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ForceVertexURBEntryReadLength, 29, 29) |
+      __gen_field(values->ForceVertexURBEntryReadOffset, 28, 28) |
+      __gen_field(values->NumberofSFOutputAttributes, 22, 27) |
+      __gen_field(values->AttributeSwizzleEnable, 21, 21) |
+      __gen_field(values->PointSpriteTextureCoordinateOrigin, 20, 20) |
+      __gen_field(values->PrimitiveIDOverrideComponentW, 19, 19) |
+      __gen_field(values->PrimitiveIDOverrideComponentZ, 18, 18) |
+      __gen_field(values->PrimitiveIDOverrideComponentY, 17, 17) |
+      __gen_field(values->PrimitiveIDOverrideComponentX, 16, 16) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 15) |
+      __gen_field(values->VertexURBEntryReadOffset, 5, 10) |
+      __gen_field(values->PrimitiveIDOverrideAttributeSelect, 0, 4) |
+      0;
+
+   /* One enable bit per attribute, full 32-bit masks. */
+   dw[2] =
+      __gen_field(values->PointSpriteTextureCoordinateEnable, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ConstantInterpolationEnable, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SBE_SWIZ length constants, plus the per-attribute
+ * SF_OUTPUT_ATTRIBUTE_DETAIL sub-structure (16 bits each; two are
+ * packed per dword by GEN8_3DSTATE_SBE_SWIZ_pack). Generated code.
+ */
+#define GEN8_3DSTATE_SBE_SWIZ_length_bias 0x00000002
+#define GEN8_3DSTATE_SBE_SWIZ_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 81,                  \
+   .DwordLength          =  9
+
+#define GEN8_3DSTATE_SBE_SWIZ_length 0x0000000b
+
+#define GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL_length 0x00000001
+
+struct GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL {
+   bool                                         ComponentOverrideW;
+   bool                                         ComponentOverrideZ;
+   bool                                         ComponentOverrideY;
+   bool                                         ComponentOverrideX;
+   uint32_t                                     SwizzleControlMode;
+/* Values for ConstantSource: */
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     ConstantSource;
+/* Values for SwizzleSelect: */
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     SwizzleSelect;
+   uint32_t                                     SourceAttribute;
+};
+
+/* Pack one attribute-detail entry; only bits 0..15 of dw[0] are used. */
+static inline void
+GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ComponentOverrideW, 15, 15) |
+      __gen_field(values->ComponentOverrideZ, 14, 14) |
+      __gen_field(values->ComponentOverrideY, 13, 13) |
+      __gen_field(values->ComponentOverrideX, 12, 12) |
+      __gen_field(values->SwizzleControlMode, 11, 11) |
+      __gen_field(values->ConstantSource, 9, 10) |
+      __gen_field(values->SwizzleSelect, 6, 7) |
+      __gen_field(values->SourceAttribute, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_SBE_SWIZ: 16 attribute swizzle entries plus 16 4-bit
+ * wrap-shortest enables. Generated code — do not hand-edit layouts.
+ */
+struct GEN8_3DSTATE_SBE_SWIZ {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL       Attribute[16];
+   uint32_t                                     AttributeWrapShortestEnables[16];
+};
+
+/* Pack a GEN8_3DSTATE_SBE_SWIZ command (11 dwords) at `dst`. */
+static inline void
+GEN8_3DSTATE_SBE_SWIZ_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_3DSTATE_SBE_SWIZ * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Command header. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Two 16-bit attribute-detail entries per dword, dw[1]..dw[8]. */
+   for (uint32_t i = 0, j = 1; i < 16; i += 2, j++) {
+      uint32_t dw_Attribute0;
+      GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(data, &dw_Attribute0, &values->Attribute[i + 0]);
+      uint32_t dw_Attribute1;
+      GEN8_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(data, &dw_Attribute1, &values->Attribute[i + 1]);
+      dw[j] =
+         __gen_field(dw_Attribute0, 0, 15) |
+         __gen_field(dw_Attribute1, 16, 31) |
+         0;
+   }
+
+   /* Eight 4-bit wrap-shortest enables per dword, dw[9]..dw[10]. */
+   for (uint32_t i = 0, j = 9; i < 16; i += 8, j++) {
+      dw[j] =
+         __gen_field(values->AttributeWrapShortestEnables[i + 0], 0, 3) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 1], 4, 7) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 2], 8, 11) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 3], 12, 15) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 4], 16, 19) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 5], 20, 23) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 6], 24, 27) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 7], 28, 31) |
+         0;
+   }
+
+}
+
+/* 3DSTATE_SCISSOR_STATE_POINTERS: points the hardware at the scissor
+ * rect state. Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_SCISSOR_STATE_POINTERS_length_bias 0x00000002
+#define GEN8_3DSTATE_SCISSOR_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 15,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_SCISSOR_STATE_POINTERS_length 0x00000002
+
+struct GEN8_3DSTATE_SCISSOR_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScissorRectPointer;
+};
+
+/* Pack a GEN8_3DSTATE_SCISSOR_STATE_POINTERS command (2 dwords). */
+static inline void
+GEN8_3DSTATE_SCISSOR_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN8_3DSTATE_SCISSOR_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Pointer occupies bits 5..31, i.e. a 32-byte-aligned offset. */
+   dw[1] =
+      __gen_offset(values->ScissorRectPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SF (Strips and Fans): rasterizer front-end state.
+ * Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_SF_length_bias 0x00000002
+#define GEN8_3DSTATE_SF_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_SF_length 0x00000004
+
+struct GEN8_3DSTATE_SF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         LegacyGlobalDepthBiasEnable;
+   bool                                         StatisticsEnable;
+   bool                                         ViewportTransformEnable;
+   float                                        LineWidth;
+/* Values for LineEndCapAntialiasingRegionWidth: */
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   bool                                         LastPixelEnable;
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+#define     AALINEDISTANCE_TRUE                                1
+   uint32_t                                     AALineDistanceMode;
+   bool                                         SmoothPointEnable;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+/* Values for PointWidthSource: */
+#define     Vertex                                             0
+#define     State                                              1
+   uint32_t                                     PointWidthSource;
+   float                                        PointWidth;
+};
+
+/* Pack a GEN8_3DSTATE_SF command (4 dwords). LineWidth is encoded as
+ * fixed point scaled by 1 << 7; PointWidth is scaled by 1 << 3.
+ */
+static inline void
+GEN8_3DSTATE_SF_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_SF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->LegacyGlobalDepthBiasEnable, 11, 11) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->ViewportTransformEnable, 1, 1) |
+      0;
+
+   dw[2] =
+      __gen_field(values->LineWidth * (1 << 7), 18, 27) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 16, 17) |
+      0;
+
+   dw[3] =
+      __gen_field(values->LastPixelEnable, 31, 31) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 29, 30) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 27, 28) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 25, 26) |
+      __gen_field(values->AALineDistanceMode, 14, 14) |
+      __gen_field(values->SmoothPointEnable, 13, 13) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 12, 12) |
+      __gen_field(values->PointWidthSource, 11, 11) |
+      __gen_field(values->PointWidth * (1 << 3), 0, 10) |
+      0;
+
+}
+
+/* 3DSTATE_SO_BUFFER: binds one stream-output buffer. Addresses are
+ * emitted through __gen_combine_address so the caller's relocation
+ * machinery (via `data`) can patch them. Generated code.
+ */
+#define GEN8_3DSTATE_SO_BUFFER_length_bias 0x00000002
+#define GEN8_3DSTATE_SO_BUFFER_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  6
+
+#define GEN8_3DSTATE_SO_BUFFER_length 0x00000008
+
+struct GEN8_3DSTATE_SO_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         SOBufferEnable;
+   uint32_t                                     SOBufferIndex;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      SOBufferObjectControlState;
+   bool                                         StreamOffsetWriteEnable;
+   bool                                         StreamOutputBufferOffsetAddressEnable;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceSize;
+   __gen_address_type                           StreamOutputBufferOffsetAddress;
+   uint32_t                                     StreamOffset;
+};
+
+/* Pack a GEN8_3DSTATE_SO_BUFFER command (8 dwords). */
+static inline void
+GEN8_3DSTATE_SO_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_3DSTATE_SO_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed first, then merged into bits 22..28. */
+   uint32_t dw_SOBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SOBufferObjectControlState, &values->SOBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->SOBufferEnable, 31, 31) |
+      __gen_field(values->SOBufferIndex, 29, 30) |
+      __gen_field(dw_SOBufferObjectControlState, 22, 28) |
+      __gen_field(values->StreamOffsetWriteEnable, 21, 21) |
+      __gen_field(values->StreamOutputBufferOffsetAddressEnable, 20, 20) |
+      0;
+
+   /* 64-bit surface base address spans dw[2]..dw[3]. */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->SurfaceSize, 0, 29) |
+      0;
+
+   /* 64-bit offset-writeback address spans dw[5]..dw[6]. */
+   uint32_t dw5 =
+      0;
+
+   uint64_t qw5 =
+      __gen_combine_address(data, &dw[5], values->StreamOutputBufferOffsetAddress, dw5);
+
+   dw[5] = qw5;
+   dw[6] = qw5 >> 32;
+
+   dw[7] =
+      __gen_field(values->StreamOffset, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_SO_DECL_LIST length constants (variable-length command, so
+ * _length is 0), plus the 16-bit SO_DECL sub-structure. Generated code.
+ */
+#define GEN8_3DSTATE_SO_DECL_LIST_length_bias 0x00000002
+#define GEN8_3DSTATE_SO_DECL_LIST_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 23
+
+#define GEN8_3DSTATE_SO_DECL_LIST_length 0x00000000
+
+#define GEN8_SO_DECL_ENTRY_length 0x00000002
+
+#define GEN8_SO_DECL_length 0x00000001
+
+struct GEN8_SO_DECL {
+   uint32_t                                     OutputBufferSlot;
+   uint32_t                                     HoleFlag;
+   uint32_t                                     RegisterIndex;
+   uint32_t                                     ComponentMask;
+};
+
+/* Pack one SO_DECL; only bits 0..13 of dw[0] are used. */
+static inline void
+GEN8_SO_DECL_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN8_SO_DECL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->OutputBufferSlot, 12, 13) |
+      __gen_field(values->HoleFlag, 11, 11) |
+      __gen_field(values->RegisterIndex, 4, 9) |
+      __gen_field(values->ComponentMask, 0, 3) |
+      0;
+
+}
+
+/* SO_DECL_ENTRY: one 16-bit SO_DECL per stream (3..0), packed together
+ * into a single 64-bit entry (two dwords). Generated code.
+ */
+struct GEN8_SO_DECL_ENTRY {
+   struct GEN8_SO_DECL                          Stream3Decl;
+   struct GEN8_SO_DECL                          Stream2Decl;
+   struct GEN8_SO_DECL                          Stream1Decl;
+   struct GEN8_SO_DECL                          Stream0Decl;
+};
+
+/* Pack a GEN8_SO_DECL_ENTRY (2 dwords): streams occupy 16-bit lanes of
+ * one qword, stream 0 lowest.
+ */
+static inline void
+GEN8_SO_DECL_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_SO_DECL_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_Stream3Decl;
+   GEN8_SO_DECL_pack(data, &dw_Stream3Decl, &values->Stream3Decl);
+   uint32_t dw_Stream2Decl;
+   GEN8_SO_DECL_pack(data, &dw_Stream2Decl, &values->Stream2Decl);
+   uint32_t dw_Stream1Decl;
+   GEN8_SO_DECL_pack(data, &dw_Stream1Decl, &values->Stream1Decl);
+   uint32_t dw_Stream0Decl;
+   GEN8_SO_DECL_pack(data, &dw_Stream0Decl, &values->Stream0Decl);
+   uint64_t qw0 =
+      __gen_field(dw_Stream3Decl, 48, 63) |
+      __gen_field(dw_Stream2Decl, 32, 47) |
+      __gen_field(dw_Stream1Decl, 16, 31) |
+      __gen_field(dw_Stream0Decl, 0, 15) |
+      0;
+
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+
+/* 3DSTATE_SO_DECL_LIST: variable-length command; the caller appends the
+ * SO_DECL_ENTRY payload after the 3 fixed dwords packed here.
+ * Generated code — do not hand-edit layouts.
+ */
+struct GEN8_3DSTATE_SO_DECL_LIST {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StreamtoBufferSelects3;
+   uint32_t                                     StreamtoBufferSelects2;
+   uint32_t                                     StreamtoBufferSelects1;
+   uint32_t                                     StreamtoBufferSelects0;
+   uint32_t                                     NumEntries3;
+   uint32_t                                     NumEntries2;
+   uint32_t                                     NumEntries1;
+   uint32_t                                     NumEntries0;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 3-dword prefix of a GEN8_3DSTATE_SO_DECL_LIST command.
+ * Note DwordLength here is 9 bits wide (0..8), unlike most commands.
+ */
+static inline void
+GEN8_3DSTATE_SO_DECL_LIST_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_SO_DECL_LIST * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StreamtoBufferSelects3, 12, 15) |
+      __gen_field(values->StreamtoBufferSelects2, 8, 11) |
+      __gen_field(values->StreamtoBufferSelects1, 4, 7) |
+      __gen_field(values->StreamtoBufferSelects0, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->NumEntries3, 24, 31) |
+      __gen_field(values->NumEntries2, 16, 23) |
+      __gen_field(values->NumEntries1, 8, 15) |
+      __gen_field(values->NumEntries0, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_STENCIL_BUFFER: binds the stencil buffer. Generated code. */
+#define GEN8_3DSTATE_STENCIL_BUFFER_length_bias 0x00000002
+#define GEN8_3DSTATE_STENCIL_BUFFER_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  3
+
+#define GEN8_3DSTATE_STENCIL_BUFFER_length 0x00000005
+
+struct GEN8_3DSTATE_STENCIL_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StencilBufferEnable;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      StencilBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Pack a GEN8_3DSTATE_STENCIL_BUFFER command (5 dwords); the 64-bit
+ * base address goes through __gen_combine_address for relocation.
+ */
+static inline void
+GEN8_3DSTATE_STENCIL_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN8_3DSTATE_STENCIL_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_StencilBufferObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StencilBufferObjectControlState, &values->StencilBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->StencilBufferEnable, 31, 31) |
+      __gen_field(dw_StencilBufferObjectControlState, 22, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   /* 64-bit surface base address spans dw[2]..dw[3]. */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+/* 3DSTATE_STREAMOUT: global stream-output enable and per-stream/buffer
+ * layout state. Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_STREAMOUT_length_bias 0x00000002
+#define GEN8_3DSTATE_STREAMOUT_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 30,                  \
+   .DwordLength          =  3
+
+#define GEN8_3DSTATE_STREAMOUT_length 0x00000005
+
+struct GEN8_3DSTATE_STREAMOUT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOFunctionEnable;
+   uint32_t                                     APIRenderingDisable;
+   uint32_t                                     RenderStreamSelect;
+/* Values for ReorderMode: */
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         SOStatisticsEnable;
+/* Values for ForceRendering (spelling as generated): */
+#define     Normal                                             0
+#define     Resreved                                           1
+#define     Force_Off                                          2
+#define     Force_on                                           3
+   uint32_t                                     ForceRendering;
+   uint32_t                                     Stream3VertexReadOffset;
+   uint32_t                                     Stream3VertexReadLength;
+   uint32_t                                     Stream2VertexReadOffset;
+   uint32_t                                     Stream2VertexReadLength;
+   uint32_t                                     Stream1VertexReadOffset;
+   uint32_t                                     Stream1VertexReadLength;
+   uint32_t                                     Stream0VertexReadOffset;
+   uint32_t                                     Stream0VertexReadLength;
+   uint32_t                                     Buffer1SurfacePitch;
+   uint32_t                                     Buffer0SurfacePitch;
+   uint32_t                                     Buffer3SurfacePitch;
+   uint32_t                                     Buffer2SurfacePitch;
+};
+
+/* Pack a GEN8_3DSTATE_STREAMOUT command (5 dwords). */
+static inline void
+GEN8_3DSTATE_STREAMOUT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_3DSTATE_STREAMOUT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SOFunctionEnable, 31, 31) |
+      __gen_field(values->APIRenderingDisable, 30, 30) |
+      __gen_field(values->RenderStreamSelect, 27, 28) |
+      __gen_field(values->ReorderMode, 26, 26) |
+      __gen_field(values->SOStatisticsEnable, 25, 25) |
+      __gen_field(values->ForceRendering, 23, 24) |
+      0;
+
+   /* Per-stream vertex read offset/length pairs, stream 3 highest. */
+   dw[2] =
+      __gen_field(values->Stream3VertexReadOffset, 29, 29) |
+      __gen_field(values->Stream3VertexReadLength, 24, 28) |
+      __gen_field(values->Stream2VertexReadOffset, 21, 21) |
+      __gen_field(values->Stream2VertexReadLength, 16, 20) |
+      __gen_field(values->Stream1VertexReadOffset, 13, 13) |
+      __gen_field(values->Stream1VertexReadLength, 8, 12) |
+      __gen_field(values->Stream0VertexReadOffset, 5, 5) |
+      __gen_field(values->Stream0VertexReadLength, 0, 4) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Buffer1SurfacePitch, 16, 27) |
+      __gen_field(values->Buffer0SurfacePitch, 0, 11) |
+      0;
+
+   dw[4] =
+      __gen_field(values->Buffer3SurfacePitch, 16, 27) |
+      __gen_field(values->Buffer2SurfacePitch, 0, 11) |
+      0;
+
+}
+
+/* 3DSTATE_TE (Tessellation Engine) state. The two maximum tessellation
+ * factors are emitted as raw 32-bit float bit patterns via __gen_float.
+ * Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_TE_length_bias 0x00000002
+#define GEN8_3DSTATE_TE_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  2
+
+#define GEN8_3DSTATE_TE_length 0x00000004
+
+struct GEN8_3DSTATE_TE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Values for Partitioning: */
+#define     INTEGER                                            0
+#define     ODD_FRACTIONAL                                     1
+#define     EVEN_FRACTIONAL                                    2
+   uint32_t                                     Partitioning;
+/* Values for OutputTopology: */
+#define     POINT                                              0
+#define     OUTPUT_LINE                                        1
+#define     OUTPUT_TRI_CW                                      2
+#define     OUTPUT_TRI_CCW                                     3
+   uint32_t                                     OutputTopology;
+/* Values for TEDomain: */
+#define     QUAD                                               0
+#define     TRI                                                1
+#define     ISOLINE                                            2
+   uint32_t                                     TEDomain;
+/* Values for TEMode: */
+#define     HW_TESS                                            0
+#define     SW_TESS                                            1
+   uint32_t                                     TEMode;
+   bool                                         TEEnable;
+   float                                        MaximumTessellationFactorOdd;
+   float                                        MaximumTessellationFactorNotOdd;
+};
+
+/* Pack a GEN8_3DSTATE_TE command (4 dwords). */
+static inline void
+GEN8_3DSTATE_TE_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_TE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Partitioning, 12, 13) |
+      __gen_field(values->OutputTopology, 8, 9) |
+      __gen_field(values->TEDomain, 4, 5) |
+      __gen_field(values->TEMode, 1, 2) |
+      __gen_field(values->TEEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->MaximumTessellationFactorOdd) |
+      0;
+
+   dw[3] =
+      __gen_float(values->MaximumTessellationFactorNotOdd) |
+      0;
+
+}
+
+/* 3DSTATE_URB_DS: URB allocation for the domain shader stage.
+ * Generated code — do not hand-edit layouts.
+ */
+#define GEN8_3DSTATE_URB_DS_length_bias 0x00000002
+#define GEN8_3DSTATE_URB_DS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 50,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_URB_DS_length 0x00000002
+
+struct GEN8_3DSTATE_URB_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DSURBStartingAddress;
+   uint32_t                                     DSURBEntryAllocationSize;
+   uint32_t                                     DSNumberofURBEntries;
+};
+
+/* Pack a GEN8_3DSTATE_URB_DS command (2 dwords). */
+static inline void
+GEN8_3DSTATE_URB_DS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_3DSTATE_URB_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DSURBStartingAddress, 25, 31) |
+      __gen_field(values->DSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->DSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_URB_GS_length_bias 0x00000002
+#define GEN8_3DSTATE_URB_GS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 51,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_URB_GS_length 0x00000002
+
+/* 3DSTATE_URB_GS (sub-opcode 51): URB allocation for the GS stage —
+ * same layout as the DS/HS variants, different sub-opcode. */
+struct GEN8_3DSTATE_URB_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     GSURBStartingAddress;
+   uint32_t                                     GSURBEntryAllocationSize;
+   uint32_t                                     GSNumberofURBEntries;
+};
+
+/* Pack *values into the 2 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_URB_GS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_3DSTATE_URB_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1: start address in bits 31:25, entry size in 24:16, count in 15:0. */
+   dw[1] =
+      __gen_field(values->GSURBStartingAddress, 25, 31) |
+      __gen_field(values->GSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->GSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_URB_HS_length_bias 0x00000002
+#define GEN8_3DSTATE_URB_HS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 49,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_URB_HS_length 0x00000002
+
+/* 3DSTATE_URB_HS (sub-opcode 49): URB allocation for the HS stage —
+ * same layout as the DS/GS variants, different sub-opcode. */
+struct GEN8_3DSTATE_URB_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     HSURBStartingAddress;
+   uint32_t                                     HSURBEntryAllocationSize;
+   uint32_t                                     HSNumberofURBEntries;
+};
+
+/* Pack *values into the 2 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_URB_HS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_3DSTATE_URB_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1: start address in bits 31:25, entry size in 24:16, count in 15:0. */
+   dw[1] =
+      __gen_field(values->HSURBStartingAddress, 25, 31) |
+      __gen_field(values->HSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->HSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VERTEX_BUFFERS_length_bias 0x00000002
+#define GEN8_3DSTATE_VERTEX_BUFFERS_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  8
+
+/* length 0: 3DSTATE_VERTEX_BUFFERS is variable-length (a list of
+ * GEN8_VERTEX_BUFFER_STATE entries follows the header dword). */
+#define GEN8_3DSTATE_VERTEX_BUFFERS_length 0x00000000
+
+#define GEN8_VERTEX_BUFFER_STATE_length 0x00000004
+
+/* One 4-dword vertex-buffer entry inside 3DSTATE_VERTEX_BUFFERS. */
+struct GEN8_VERTEX_BUFFER_STATE {
+   uint32_t                                     VertexBufferIndex;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   uint32_t                                     AddressModifyEnable;
+   bool                                         NullVertexBuffer;
+   uint32_t                                     BufferPitch;
+   __gen_address_type                           BufferStartingAddress;
+   uint32_t                                     BufferSize;
+};
+
+/* Pack *values into 4 dwords at dst.  The MOCS sub-struct is packed
+ * first, then merged into dw0; the 64-bit buffer address is emitted via
+ * __gen_combine_address (which may record a relocation through `data`)
+ * and split across dw1/dw2. */
+static inline void
+GEN8_VERTEX_BUFFER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_VERTEX_BUFFER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_MemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(dw_MemoryObjectControlState, 16, 22) |
+      __gen_field(values->AddressModifyEnable, 14, 14) |
+      __gen_field(values->NullVertexBuffer, 13, 13) |
+      __gen_field(values->BufferPitch, 0, 11) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   /* 64-bit graphics address occupies dw1 (low) and dw2 (high). */
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->BufferSize, 0, 31) |
+      0;
+
+}
+
+/* Header-only view of 3DSTATE_VERTEX_BUFFERS; the per-buffer
+ * GEN8_VERTEX_BUFFER_STATE entries are emitted separately by callers. */
+struct GEN8_3DSTATE_VERTEX_BUFFERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword; DwordLength must already account for the
+ * variable-length payload the caller appends.  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VERTEX_BUFFERS_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN8_3DSTATE_VERTEX_BUFFERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_VERTEX_ELEMENTS_length_bias 0x00000002
+#define GEN8_3DSTATE_VERTEX_ELEMENTS_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  9
+
+/* length 0: 3DSTATE_VERTEX_ELEMENTS is variable-length (a list of
+ * GEN8_VERTEX_ELEMENT_STATE entries follows the header dword). */
+#define GEN8_3DSTATE_VERTEX_ELEMENTS_length 0x00000000
+
+#define GEN8_VERTEX_ELEMENT_STATE_length 0x00000002
+
+/* One 2-dword vertex-element descriptor inside 3DSTATE_VERTEX_ELEMENTS. */
+struct GEN8_VERTEX_ELEMENT_STATE {
+   uint32_t                                     VertexBufferIndex;
+   bool                                         Valid;
+   uint32_t                                     SourceElementFormat;
+   bool                                         EdgeFlagEnable;
+   uint32_t                                     SourceElementOffset;
+   uint32_t                                     Component0Control;
+   uint32_t                                     Component1Control;
+   uint32_t                                     Component2Control;
+   uint32_t                                     Component3Control;
+};
+
+/* Pack *values into 2 dwords at dst.  dw1 holds the four per-component
+ * controls in 3-bit fields at bits 30:28, 26:24, 22:20 and 18:16.
+ * `data` is unused here but kept for the uniform pack signature. */
+static inline void
+GEN8_VERTEX_ELEMENT_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_VERTEX_ELEMENT_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->Valid, 25, 25) |
+      __gen_field(values->SourceElementFormat, 16, 24) |
+      __gen_field(values->EdgeFlagEnable, 15, 15) |
+      __gen_field(values->SourceElementOffset, 0, 11) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Component0Control, 28, 30) |
+      __gen_field(values->Component1Control, 24, 26) |
+      __gen_field(values->Component2Control, 20, 22) |
+      __gen_field(values->Component3Control, 16, 18) |
+      0;
+
+}
+
+/* Header-only view of 3DSTATE_VERTEX_ELEMENTS; the per-element
+ * GEN8_VERTEX_ELEMENT_STATE entries are emitted separately by callers. */
+struct GEN8_3DSTATE_VERTEX_ELEMENTS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Pack only the header dword; DwordLength must already account for the
+ * variable-length payload the caller appends.  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VERTEX_ELEMENTS_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN8_3DSTATE_VERTEX_ELEMENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN8_3DSTATE_VF_length_bias 0x00000002
+#define GEN8_3DSTATE_VF_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 12,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_VF_length 0x00000002
+
+/* 3DSTATE_VF (sub-opcode 12): vertex-fetch control — cut-index enable
+ * for indexed draws plus the cut-index value itself. */
+struct GEN8_3DSTATE_VF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndexedDrawCutIndexEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CutIndex;
+};
+
+/* Pack *values into the 2 command dwords at dst; note the cut-index
+ * enable lives in the header dword (bit 8).  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VF_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_VF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndexedDrawCutIndexEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CutIndex, 0, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VF_INSTANCING_length_bias 0x00000002
+#define GEN8_3DSTATE_VF_INSTANCING_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 73,                  \
+   .DwordLength          =  1
+
+#define GEN8_3DSTATE_VF_INSTANCING_length 0x00000003
+
+/* 3DSTATE_VF_INSTANCING (sub-opcode 73): per-vertex-element instancing
+ * enable and instance data step rate. */
+struct GEN8_3DSTATE_VF_INSTANCING {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         InstancingEnable;
+   uint32_t                                     VertexElementIndex;
+   uint32_t                                     InstanceDataStepRate;
+};
+
+/* Pack *values into the 3 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_VF_INSTANCING_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_3DSTATE_VF_INSTANCING * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InstancingEnable, 8, 8) |
+      __gen_field(values->VertexElementIndex, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->InstanceDataStepRate, 0, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VF_SGVS_length_bias 0x00000002
+#define GEN8_3DSTATE_VF_SGVS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 74,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_VF_SGVS_length 0x00000002
+
+/* 3DSTATE_VF_SGVS (sub-opcode 74): system-generated vertex values —
+ * where (element/component) to inject InstanceID and VertexID. */
+struct GEN8_3DSTATE_VF_SGVS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         InstanceIDEnable;
+#define     COMP_0                                             0
+#define     COMP_1                                             1
+#define     COMP_2                                             2
+#define     COMP_3                                             3
+   uint32_t                                     InstanceIDComponentNumber;
+   uint32_t                                     InstanceIDElementOffset;
+   bool                                         VertexIDEnable;
+/* COMP_* intentionally redefined with identical values for the VertexID
+ * field below; identical macro redefinition is legal C (C11 6.10.3). */
+#define     COMP_0                                             0
+#define     COMP_1                                             1
+#define     COMP_2                                             2
+#define     COMP_3                                             3
+   uint32_t                                     VertexIDComponentNumber;
+   uint32_t                                     VertexIDElementOffset;
+};
+
+/* Pack *values into the 2 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_VF_SGVS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN8_3DSTATE_VF_SGVS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InstanceIDEnable, 31, 31) |
+      __gen_field(values->InstanceIDComponentNumber, 29, 30) |
+      __gen_field(values->InstanceIDElementOffset, 16, 21) |
+      __gen_field(values->VertexIDEnable, 15, 15) |
+      __gen_field(values->VertexIDComponentNumber, 13, 14) |
+      __gen_field(values->VertexIDElementOffset, 0, 5) |
+      0;
+
+}
+
+/* Note: single-dword command (length/bias 1, CommandSubType 1), unlike
+ * the surrounding 3DSTATE commands. */
+#define GEN8_3DSTATE_VF_STATISTICS_length_bias 0x00000001
+#define GEN8_3DSTATE_VF_STATISTICS_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 11
+
+#define GEN8_3DSTATE_VF_STATISTICS_length 0x00000001
+
+/* 3DSTATE_VF_STATISTICS (sub-opcode 11): toggles vertex-fetch
+ * statistics gathering. */
+struct GEN8_3DSTATE_VF_STATISTICS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         StatisticsEnable;
+};
+
+/* Pack *values into the single command dword at dst; the enable bit
+ * replaces the usual DwordLength field.  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VF_STATISTICS_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_3DSTATE_VF_STATISTICS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->StatisticsEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VF_TOPOLOGY_length_bias 0x00000002
+#define GEN8_3DSTATE_VF_TOPOLOGY_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 75,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_VF_TOPOLOGY_length 0x00000002
+
+/* 3DSTATE_VF_TOPOLOGY (sub-opcode 75): selects the primitive topology
+ * for subsequent draws. */
+struct GEN8_3DSTATE_VF_TOPOLOGY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PrimitiveTopologyType;
+};
+
+/* Pack *values into the 2 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_VF_TOPOLOGY_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_3DSTATE_VF_TOPOLOGY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length_bias 0x00000002
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 35,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length 0x00000002
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_CC (sub-opcode 35): pointer to the
+ * CC viewport state. */
+struct GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CCViewportPointer;
+};
+
+/* Pack *values into the 2 command dwords at dst.  The pointer goes
+ * through __gen_offset (a 32-byte-aligned state offset, low 5 bits
+ * implied zero), not __gen_field.  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->CCViewportPointer, 5, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length_bias 0x00000002
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 33,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length 0x00000002
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP (sub-opcode 33): pointer to
+ * the combined SF/CLIP viewport state. */
+struct GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SFClipViewportPointer;
+};
+
+/* Pack *values into the 2 command dwords at dst.  The pointer goes
+ * through __gen_offset starting at bit 6 (64-byte alignment), unlike
+ * the CC variant's bit 5.  `data` is unused here. */
+static inline void
+GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                                                  const struct GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SFClipViewportPointer, 6, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_WM_length_bias 0x00000002
+#define GEN8_3DSTATE_WM_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_WM_length 0x00000002
+
+/* 3DSTATE_WM (sub-opcode 20): windower/pixel-pipe control — statistics,
+ * legacy depth clear/resolve, rasterization and interpolation modes,
+ * stipple enables.  The interleaved #defines name the legal values of
+ * the field that follows them. */
+struct GEN8_3DSTATE_WM {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StatisticsEnable;
+   bool                                         LegacyDepthBufferClearEnable;
+   bool                                         LegacyDepthBufferResolveEnable;
+   bool                                         LegacyHierarchicalDepthBufferResolveEnable;
+   bool                                         LegacyDiamondLineRasterization;
+#define     NORMAL                                             0
+#define     PSEXEC                                             1
+#define     PREPS                                              2
+   uint32_t                                     EarlyDepthStencilControl;
+#define     Normal                                             0
+#define     ForceOff                                           1
+#define     ForceON                                            2
+   uint32_t                                     ForceThreadDispatchEnable;
+#define     INTERP_PIXEL                                       0
+#define     INTERP_CENTROID                                    2
+#define     INTERP_SAMPLE                                      3
+   uint32_t                                     PositionZWInterpolationMode;
+   uint32_t                                     BarycentricInterpolationMode;
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+/* _05pixels.. redefined identically for the next field; legal in C. */
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineAntialiasingRegionWidth;
+   bool                                         PolygonStippleEnable;
+   bool                                         LineStippleEnable;
+#define     RASTRULE_UPPER_LEFT                                0
+#define     RASTRULE_UPPER_RIGHT                               1
+   uint32_t                                     PointRasterizationRule;
+#define     Normal                                             0
+#define     ForceOff                                           1
+#define     ForceON                                            2
+   uint32_t                                     ForceKillPixelEnable;
+};
+
+/* Pack *values into the 2 command dwords at dst.  Note bit 25 of dw1 is
+ * not written by any field here.  `data` is unused. */
+static inline void
+GEN8_3DSTATE_WM_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_3DSTATE_WM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StatisticsEnable, 31, 31) |
+      __gen_field(values->LegacyDepthBufferClearEnable, 30, 30) |
+      __gen_field(values->LegacyDepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->LegacyHierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->LegacyDiamondLineRasterization, 26, 26) |
+      __gen_field(values->EarlyDepthStencilControl, 21, 22) |
+      __gen_field(values->ForceThreadDispatchEnable, 19, 20) |
+      __gen_field(values->PositionZWInterpolationMode, 17, 18) |
+      __gen_field(values->BarycentricInterpolationMode, 11, 16) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 8, 9) |
+      __gen_field(values->LineAntialiasingRegionWidth, 6, 7) |
+      __gen_field(values->PolygonStippleEnable, 4, 4) |
+      __gen_field(values->LineStippleEnable, 3, 3) |
+      __gen_field(values->PointRasterizationRule, 2, 2) |
+      __gen_field(values->ForceKillPixelEnable, 0, 1) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_WM_CHROMAKEY_length_bias 0x00000002
+#define GEN8_3DSTATE_WM_CHROMAKEY_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 76,                  \
+   .DwordLength          =  0
+
+#define GEN8_3DSTATE_WM_CHROMAKEY_length 0x00000002
+
+/* 3DSTATE_WM_CHROMAKEY (sub-opcode 76): single chroma-key kill enable. */
+struct GEN8_3DSTATE_WM_CHROMAKEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ChromaKeyKillEnable;
+};
+
+/* Pack *values into the 2 command dwords at dst.  `data` is unused here
+ * but kept for the uniform generated pack-function signature. */
+static inline void
+GEN8_3DSTATE_WM_CHROMAKEY_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_3DSTATE_WM_CHROMAKEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyKillEnable, 31, 31) |
+      0;
+
+}
+
+#define GEN8_3DSTATE_WM_DEPTH_STENCIL_length_bias 0x00000002
+#define GEN8_3DSTATE_WM_DEPTH_STENCIL_header    \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 78,                  \
+   .DwordLength          =  1
+
+#define GEN8_3DSTATE_WM_DEPTH_STENCIL_length 0x00000003
+
+/* 3DSTATE_WM_DEPTH_STENCIL (sub-opcode 78): front/back stencil ops and
+ * test functions, depth test/write enables, and stencil masks. */
+struct GEN8_3DSTATE_WM_DEPTH_STENCIL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StencilFailOp;
+   uint32_t                                     StencilPassDepthFailOp;
+   uint32_t                                     StencilPassDepthPassOp;
+   uint32_t                                     BackfaceStencilTestFunction;
+   uint32_t                                     BackfaceStencilFailOp;
+   uint32_t                                     BackfaceStencilPassDepthFailOp;
+   uint32_t                                     BackfaceStencilPassDepthPassOp;
+   uint32_t                                     StencilTestFunction;
+   uint32_t                                     DepthTestFunction;
+   bool                                         DoubleSidedStencilEnable;
+   bool                                         StencilTestEnable;
+   bool                                         StencilBufferWriteEnable;
+   bool                                         DepthTestEnable;
+   bool                                         DepthBufferWriteEnable;
+   uint32_t                                     StencilTestMask;
+   uint32_t                                     StencilWriteMask;
+   uint32_t                                     BackfaceStencilTestMask;
+   uint32_t                                     BackfaceStencilWriteMask;
+};
+
+/* Pack *values into the 3 command dwords at dst: dw1 carries ops,
+ * functions and enables; dw2 carries the four 8-bit masks.  `data` is
+ * unused here but kept for the uniform pack signature. */
+static inline void
+GEN8_3DSTATE_WM_DEPTH_STENCIL_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN8_3DSTATE_WM_DEPTH_STENCIL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StencilFailOp, 29, 31) |
+      __gen_field(values->StencilPassDepthFailOp, 26, 28) |
+      __gen_field(values->StencilPassDepthPassOp, 23, 25) |
+      __gen_field(values->BackfaceStencilTestFunction, 20, 22) |
+      __gen_field(values->BackfaceStencilFailOp, 17, 19) |
+      __gen_field(values->BackfaceStencilPassDepthFailOp, 14, 16) |
+      __gen_field(values->BackfaceStencilPassDepthPassOp, 11, 13) |
+      __gen_field(values->StencilTestFunction, 8, 10) |
+      __gen_field(values->DepthTestFunction, 5, 7) |
+      __gen_field(values->DoubleSidedStencilEnable, 4, 4) |
+      __gen_field(values->StencilTestEnable, 3, 3) |
+      __gen_field(values->StencilBufferWriteEnable, 2, 2) |
+      __gen_field(values->DepthTestEnable, 1, 1) |
+      __gen_field(values->DepthBufferWriteEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->StencilTestMask, 24, 31) |
+      __gen_field(values->StencilWriteMask, 16, 23) |
+      __gen_field(values->BackfaceStencilTestMask, 8, 15) |
+      __gen_field(values->BackfaceStencilWriteMask, 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_WM_HZ_OP: per the field names below, enables depth/stencil
+ * buffer clears and resolves over a clear rectangle with a sample mask.
+ * Generated-code pattern used throughout this file: a _header macro of
+ * designated-initializer defaults, a mirror struct, and a _pack()
+ * serializer built from __gen_field(value, lo_bit, hi_bit) terms.
+ */
+#define GEN8_3DSTATE_WM_HZ_OP_length_bias 0x00000002
+#define GEN8_3DSTATE_WM_HZ_OP_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 82,                  \
+   .DwordLength          =  3
+
+/* Total command size in dwords (DwordLength 3 + length_bias 2 = 5). */
+#define GEN8_3DSTATE_WM_HZ_OP_length 0x00000005
+
+struct GEN8_3DSTATE_WM_HZ_OP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StencilBufferClearEnable;
+   bool                                         DepthBufferClearEnable;
+   bool                                         ScissorRectangleEnable;
+   bool                                         DepthBufferResolveEnable;
+   bool                                         HierarchicalDepthBufferResolveEnable;
+   uint32_t                                     PixelPositionOffsetEnable;
+   bool                                         FullSurfaceDepthClear;
+   uint32_t                                     StencilClearValue;
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     ClearRectangleYMin;
+   uint32_t                                     ClearRectangleXMin;
+   uint32_t                                     ClearRectangleYMax;
+   uint32_t                                     ClearRectangleXMax;
+   uint32_t                                     SampleMask;
+};
+
+/* Serialize *values into dst as 5 little-endian dwords.  The data
+ * argument is part of the common generated pack signature; in this file
+ * it is consumed only by commands that emit relocations through
+ * __gen_combine_address (none here).
+ */
+static inline void
+GEN8_3DSTATE_WM_HZ_OP_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_3DSTATE_WM_HZ_OP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StencilBufferClearEnable, 31, 31) |
+      __gen_field(values->DepthBufferClearEnable, 30, 30) |
+      __gen_field(values->ScissorRectangleEnable, 29, 29) |
+      __gen_field(values->DepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->HierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->PixelPositionOffsetEnable, 26, 26) |
+      __gen_field(values->FullSurfaceDepthClear, 25, 25) |
+      __gen_field(values->StencilClearValue, 16, 23) |
+      __gen_field(values->NumberofMultisamples, 13, 15) |
+      0;
+
+   /* Clear rectangle: min corner in dw[2], max corner in dw[3]. */
+   dw[2] =
+      __gen_field(values->ClearRectangleYMin, 16, 31) |
+      __gen_field(values->ClearRectangleXMin, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ClearRectangleYMax, 16, 31) |
+      __gen_field(values->ClearRectangleXMax, 0, 15) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SampleMask, 0, 15) |
+      0;
+
+}
+
+/* GPGPU_WALKER: dispatches a 3-D grid of GPGPU thread groups
+ * (X/Y/Z start, dimension, and per-thread-group counter maxima),
+ * optionally driven by indirect parameters.
+ */
+#define GEN8_GPGPU_WALKER_length_bias 0x00000002
+#define GEN8_GPGPU_WALKER_header                \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  5,                  \
+   .DwordLength          = 13
+
+/* 15 dwords total (DwordLength 13 + bias 2). */
+#define GEN8_GPGPU_WALKER_length 0x0000000f
+
+struct GEN8_GPGPU_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   bool                                         IndirectParameterEnable;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   /* Allowed values for SIMDSize: */
+#define     SIMD8                                              0
+#define     SIMD16                                             1
+#define     SIMD32                                             2
+   uint32_t                                     SIMDSize;
+   uint32_t                                     ThreadDepthCounterMaximum;
+   uint32_t                                     ThreadHeightCounterMaximum;
+   uint32_t                                     ThreadWidthCounterMaximum;
+   uint32_t                                     ThreadGroupIDStartingX;
+   uint32_t                                     ThreadGroupIDXDimension;
+   uint32_t                                     ThreadGroupIDStartingY;
+   uint32_t                                     ThreadGroupIDYDimension;
+   uint32_t                                     ThreadGroupIDStartingResumeZ;
+   uint32_t                                     ThreadGroupIDZDimension;
+   uint32_t                                     RightExecutionMask;
+   uint32_t                                     BottomExecutionMask;
+};
+
+/* Serialize *values into dst as 15 dwords; data is unused here (no
+ * relocated addresses in this command).
+ */
+static inline void
+GEN8_GPGPU_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_GPGPU_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   /* NOTE: 17-bit field (bits 0..16), matching the other MEDIA_* commands. */
+   dw[2] =
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   /* __gen_offset: value already carries its low alignment bits (64B). */
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 6, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SIMDSize, 30, 31) |
+      __gen_field(values->ThreadDepthCounterMaximum, 16, 21) |
+      __gen_field(values->ThreadHeightCounterMaximum, 8, 13) |
+      __gen_field(values->ThreadWidthCounterMaximum, 0, 5) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDStartingX, 0, 31) |
+      0;
+
+   /* dw[6] and dw[9] carry no fields in this layout — presumably
+    * reserved/MBZ dwords; confirm against the Gen8 PRM. */
+   dw[6] =
+      0;
+
+   dw[7] =
+      __gen_field(values->ThreadGroupIDXDimension, 0, 31) |
+      0;
+
+   dw[8] =
+      __gen_field(values->ThreadGroupIDStartingY, 0, 31) |
+      0;
+
+   dw[9] =
+      0;
+
+   dw[10] =
+      __gen_field(values->ThreadGroupIDYDimension, 0, 31) |
+      0;
+
+   dw[11] =
+      __gen_field(values->ThreadGroupIDStartingResumeZ, 0, 31) |
+      0;
+
+   dw[12] =
+      __gen_field(values->ThreadGroupIDZDimension, 0, 31) |
+      0;
+
+   dw[13] =
+      __gen_field(values->RightExecutionMask, 0, 31) |
+      0;
+
+   dw[14] =
+      __gen_field(values->BottomExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_CURBE_LOAD: loads CURBE (constant URB entry) data — a start
+ * address and total length — for the media/GPGPU pipeline.
+ */
+#define GEN8_MEDIA_CURBE_LOAD_length_bias 0x00000002
+#define GEN8_MEDIA_CURBE_LOAD_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  1,                  \
+   .DwordLength          =  2
+
+#define GEN8_MEDIA_CURBE_LOAD_length 0x00000004
+
+struct GEN8_MEDIA_CURBE_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CURBETotalDataLength;
+   uint32_t                                     CURBEDataStartAddress;
+};
+
+/* Serialize *values into dst as 4 dwords; data is unused here. */
+static inline void
+GEN8_MEDIA_CURBE_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_MEDIA_CURBE_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   /* dw[1]: no fields — presumably reserved/MBZ. */
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->CURBETotalDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_field(values->CURBEDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_INTERFACE_DESCRIPTOR_LOAD: points the media pipeline at the
+ * interface-descriptor table (start address + total length).
+ */
+#define GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length_bias 0x00000002
+#define GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD_header\
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          =  2
+
+#define GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length 0x00000004
+
+struct GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorTotalLength;
+   uint32_t                                     InterfaceDescriptorDataStartAddress;
+};
+
+/* Serialize *values into dst as 4 dwords; data is unused here. */
+static inline void
+GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   /* dw[1]: no fields — presumably reserved/MBZ. */
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->InterfaceDescriptorTotalLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->InterfaceDescriptorDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_OBJECT: dispatches a single media/GPGPU thread with optional
+ * inline and indirect data plus scoreboard coordinates.
+ */
+#define GEN8_MEDIA_OBJECT_length_bias 0x00000002
+#define GEN8_MEDIA_OBJECT_header                \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  0
+
+/* length 0: variable-length command (inline data follows the fixed part). */
+#define GEN8_MEDIA_OBJECT_length 0x00000000
+
+struct GEN8_MEDIA_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+   uint32_t                                     ForceDestination;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+#define     Slice0                                             0
+#define     Slice1                                             1
+#define     Slice2                                             2
+   uint32_t                                     SliceDestinationSelect;
+#define     SubSlice2                                          2
+#define     SubSlice1                                          1
+#define     SubSlice0                                          0
+   uint32_t                                     SubSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   /* NOTE(review): "ScoredboardY" looks like a generator typo for
+    * ScoreboardY (compare GEN8_MEDIA_OBJECT_GRPID below); renaming it
+    * would break existing users of this generated header. */
+   uint32_t                                     ScoredboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   /* NOTE(review): declared bool but packed into bits 0..7 below — a
+    * multi-bit mask cannot be expressed through a bool; confirm the
+    * intended type against the generator/PRM. */
+   bool                                         ScoreboardMask;
+   /* variable length fields follow */
+};
+
+/* Serialize the fixed 6-dword part of *values into dst; data is passed
+ * to __gen_combine_address to emit the IndirectDataStartAddress
+ * relocation.  Caller appends the variable-length inline data.
+ */
+static inline void
+GEN8_MEDIA_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_MEDIA_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->ForceDestination, 22, 22) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->SliceDestinationSelect, 19, 20) |
+      __gen_field(values->SubSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   /* dw3 holds the non-address bits (none) merged with the relocation. */
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoredboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MEDIA_OBJECT_GRPID: like MEDIA_OBJECT but dispatches within a thread
+ * group identified by GroupID (with an end-of-thread-group flag).
+ */
+#define GEN8_MEDIA_OBJECT_GRPID_length_bias 0x00000002
+#define GEN8_MEDIA_OBJECT_GRPID_header          \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  6
+
+/* length 0: variable-length command. */
+#define GEN8_MEDIA_OBJECT_GRPID_length 0x00000000
+
+struct GEN8_MEDIA_OBJECT_GRPID {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     EndofThreadGroup;
+   uint32_t                                     ForceDestination;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+#define     Slice0                                             0
+#define     Slice1                                             1
+#define     Slice2                                             2
+   uint32_t                                     SliceDestinationSelect;
+#define     SubSlice2                                          2
+#define     SubSlice1                                          1
+#define     SubSlice0                                          0
+   uint32_t                                     SubSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   uint32_t                                     ScoreboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   /* NOTE(review): bool packed into bits 0..7 below — see the same
+    * remark on GEN8_MEDIA_OBJECT. */
+   bool                                         ScoreboardMask;
+   uint32_t                                     GroupID;
+   /* variable length fields follow */
+};
+
+/* Serialize the fixed 7-dword part of *values into dst; data feeds the
+ * IndirectDataStartAddress relocation via __gen_combine_address.
+ */
+static inline void
+GEN8_MEDIA_OBJECT_GRPID_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN8_MEDIA_OBJECT_GRPID * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->EndofThreadGroup, 23, 23) |
+      __gen_field(values->ForceDestination, 22, 22) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->SliceDestinationSelect, 19, 20) |
+      __gen_field(values->SubSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   /* dw3 holds the non-address bits (none) merged with the relocation. */
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoreboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->GroupID, 0, 31) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MEDIA_OBJECT_PRT: persistent-root-thread dispatch carrying a fixed
+ * 12-dword inline data payload and PRT fence controls.
+ */
+#define GEN8_MEDIA_OBJECT_PRT_length_bias 0x00000002
+#define GEN8_MEDIA_OBJECT_PRT_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          = 14
+
+/* 16 dwords total (DwordLength 14 + bias 2). */
+#define GEN8_MEDIA_OBJECT_PRT_length 0x00000010
+
+struct GEN8_MEDIA_OBJECT_PRT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+   bool                                         PRT_FenceNeeded;
+   /* Allowed values for PRT_FenceType: */
+#define     Rootthreadqueue                                    0
+#define     VFEstateflush                                      1
+   uint32_t                                     PRT_FenceType;
+   uint32_t                                     InlineData[12];
+};
+
+/* Serialize *values into dst as 16 dwords (4 header/control dwords plus
+ * 12 inline-data dwords); data is unused here.
+ */
+static inline void
+GEN8_MEDIA_OBJECT_PRT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_MEDIA_OBJECT_PRT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->PRT_FenceNeeded, 23, 23) |
+      __gen_field(values->PRT_FenceType, 22, 22) |
+      0;
+
+   /* dw[3]: no fields — presumably reserved/MBZ. */
+   dw[3] =
+      0;
+
+   /* Copy the 12 inline-data dwords into dw[4]..dw[15]. */
+   for (uint32_t i = 0, j = 4; i < 12; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->InlineData[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+/* MEDIA_OBJECT_WALKER: hardware walker dispatch — iterates thread
+ * launches over global/local loops (start, resolution, stride, inner
+ * unit per axis) with scoreboard control.
+ */
+#define GEN8_MEDIA_OBJECT_WALKER_length_bias 0x00000002
+#define GEN8_MEDIA_OBJECT_WALKER_header         \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  3
+
+/* length 0: variable-length command (inline data follows). */
+#define GEN8_MEDIA_OBJECT_WALKER_length 0x00000000
+
+struct GEN8_MEDIA_OBJECT_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   uint32_t                                     GroupIDLoopSelect;
+   /* NOTE(review): bool packed into bits 0..7 below — see the remark on
+    * GEN8_MEDIA_OBJECT. */
+   bool                                         ScoreboardMask;
+   uint32_t                                     ColorCountMinusOne;
+   uint32_t                                     MiddleLoopExtraSteps;
+   uint32_t                                     LocalMidLoopUnitY;
+   uint32_t                                     MidLoopUnitX;
+   uint32_t                                     GlobalLoopExecCount;
+   uint32_t                                     LocalLoopExecCount;
+   uint32_t                                     BlockResolutionY;
+   uint32_t                                     BlockResolutionX;
+   uint32_t                                     LocalStartY;
+   uint32_t                                     LocalStartX;
+   uint32_t                                     LocalOuterLoopStrideY;
+   uint32_t                                     LocalOuterLoopStrideX;
+   uint32_t                                     LocalInnerLoopUnitY;
+   uint32_t                                     LocalInnerLoopUnitX;
+   uint32_t                                     GlobalResolutionY;
+   uint32_t                                     GlobalResolutionX;
+   uint32_t                                     GlobalStartY;
+   uint32_t                                     GlobalStartX;
+   uint32_t                                     GlobalOuterLoopStrideY;
+   uint32_t                                     GlobalOuterLoopStrideX;
+   uint32_t                                     GlobalInnerLoopUnitY;
+   uint32_t                                     GlobalInnerLoopUnitX;
+   /* variable length fields follow */
+};
+
+/* Serialize the fixed 17-dword part of *values into dst; data is
+ * unused here (IndirectDataStartAddress is a plain __gen_offset, not a
+ * relocation).  Caller appends the variable-length inline data.
+ */
+static inline void
+GEN8_MEDIA_OBJECT_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_MEDIA_OBJECT_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   /* dw[4] and dw[10]: no fields — presumably reserved/MBZ. */
+   dw[4] =
+      0;
+
+   dw[5] =
+      __gen_field(values->GroupIDLoopSelect, 8, 31) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ColorCountMinusOne, 24, 27) |
+      __gen_field(values->MiddleLoopExtraSteps, 16, 20) |
+      __gen_field(values->LocalMidLoopUnitY, 12, 13) |
+      __gen_field(values->MidLoopUnitX, 8, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->GlobalLoopExecCount, 16, 25) |
+      __gen_field(values->LocalLoopExecCount, 0, 9) |
+      0;
+
+   dw[8] =
+      __gen_field(values->BlockResolutionY, 16, 24) |
+      __gen_field(values->BlockResolutionX, 0, 8) |
+      0;
+
+   dw[9] =
+      __gen_field(values->LocalStartY, 16, 24) |
+      __gen_field(values->LocalStartX, 0, 8) |
+      0;
+
+   dw[10] =
+      0;
+
+   dw[11] =
+      __gen_field(values->LocalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->LocalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[12] =
+      __gen_field(values->LocalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->LocalInnerLoopUnitX, 0, 9) |
+      0;
+
+   dw[13] =
+      __gen_field(values->GlobalResolutionY, 16, 24) |
+      __gen_field(values->GlobalResolutionX, 0, 8) |
+      0;
+
+   dw[14] =
+      __gen_field(values->GlobalStartY, 16, 25) |
+      __gen_field(values->GlobalStartX, 0, 9) |
+      0;
+
+   dw[15] =
+      __gen_field(values->GlobalOuterLoopStrideY, 16, 25) |
+      __gen_field(values->GlobalOuterLoopStrideX, 0, 9) |
+      0;
+
+   dw[16] =
+      __gen_field(values->GlobalInnerLoopUnitY, 16, 25) |
+      __gen_field(values->GlobalInnerLoopUnitX, 0, 9) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MEDIA_STATE_FLUSH: flushes media pipeline state (flush-to-GO /
+ * watermark) for a given interface descriptor.
+ */
+#define GEN8_MEDIA_STATE_FLUSH_length_bias 0x00000002
+#define GEN8_MEDIA_STATE_FLUSH_header           \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  0
+
+#define GEN8_MEDIA_STATE_FLUSH_length 0x00000002
+
+struct GEN8_MEDIA_STATE_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         FlushtoGO;
+   uint32_t                                     WatermarkRequired;
+   uint32_t                                     InterfaceDescriptorOffset;
+};
+
+/* Serialize *values into dst as 2 dwords; data is unused here. */
+static inline void
+GEN8_MEDIA_STATE_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_MEDIA_STATE_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->FlushtoGO, 7, 7) |
+      __gen_field(values->WatermarkRequired, 6, 6) |
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+}
+
+/* MEDIA_VFE_STATE: configures the video front end — scratch space,
+ * thread/URB/CURBE allocation, and the scoreboard (enable, type, mask,
+ * and eight X/Y delta pairs).
+ */
+#define GEN8_MEDIA_VFE_STATE_length_bias 0x00000002
+#define GEN8_MEDIA_VFE_STATE_header             \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  0,                  \
+   .DwordLength          =  7
+
+/* 9 dwords total (DwordLength 7 + bias 2). */
+#define GEN8_MEDIA_VFE_STATE_length 0x00000009
+
+struct GEN8_MEDIA_VFE_STATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     StackSize;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     ScratchSpaceBasePointerHigh;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     NumberofURBEntries;
+#define     Maintainingtheexistingtimestampstate               0
+#define     Resettingrelativetimerandlatchingtheglobaltimestamp       1
+   uint32_t                                     ResetGatewayTimer;
+#define     MaintainingOpenGatewayForwardMsgCloseGatewayprotocollegacymode       0
+#define     BypassingOpenGatewayCloseGatewayprotocol           1
+   uint32_t                                     BypassGatewayControl;
+   uint32_t                                     SliceDisable;
+   uint32_t                                     URBEntryAllocationSize;
+   uint32_t                                     CURBEAllocationSize;
+#define     Scoreboarddisabled                                 0
+#define     Scoreboardenabled                                  1
+   uint32_t                                     ScoreboardEnable;
+#define     StallingScoreboard                                 0
+#define     NonStallingScoreboard                              1
+   uint32_t                                     ScoreboardType;
+   uint32_t                                     ScoreboardMask;
+   /* Scoreboard dependency deltas: pairs 0..3 pack into dw[7],
+    * pairs 4..7 into dw[8]. */
+   uint32_t                                     Scoreboard3DeltaY;
+   uint32_t                                     Scoreboard3DeltaX;
+   uint32_t                                     Scoreboard2DeltaY;
+   uint32_t                                     Scoreboard2DeltaX;
+   uint32_t                                     Scoreboard1DeltaY;
+   uint32_t                                     Scoreboard1DeltaX;
+   uint32_t                                     Scoreboard0DeltaY;
+   uint32_t                                     Scoreboard0DeltaX;
+   uint32_t                                     Scoreboard7DeltaY;
+   uint32_t                                     Scoreboard7DeltaX;
+   uint32_t                                     Scoreboard6DeltaY;
+   uint32_t                                     Scoreboard6DeltaX;
+   uint32_t                                     Scoreboard5DeltaY;
+   uint32_t                                     Scoreboard5DeltaX;
+   uint32_t                                     Scoreboard4DeltaY;
+   uint32_t                                     Scoreboard4DeltaX;
+};
+
+/* Serialize *values into dst as 9 dwords; data is unused here
+ * (the scratch base is a __gen_offset, not a relocation).
+ */
+static inline void
+GEN8_MEDIA_VFE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN8_MEDIA_VFE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->StackSize, 4, 7) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->ScratchSpaceBasePointerHigh, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->MaximumNumberofThreads, 16, 31) |
+      __gen_field(values->NumberofURBEntries, 8, 15) |
+      __gen_field(values->ResetGatewayTimer, 7, 7) |
+      __gen_field(values->BypassGatewayControl, 6, 6) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SliceDisable, 0, 1) |
+      0;
+
+   dw[5] =
+      __gen_field(values->URBEntryAllocationSize, 16, 31) |
+      __gen_field(values->CURBEAllocationSize, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ScoreboardEnable, 31, 31) |
+      __gen_field(values->ScoreboardType, 30, 30) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[7] =
+      __gen_field(values->Scoreboard3DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard3DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard2DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard2DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard1DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard1DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard0DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard0DeltaX, 0, 3) |
+      0;
+
+   dw[8] =
+      __gen_field(values->Scoreboard7DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard7DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard6DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard6DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard5DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard5DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard4DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard4DeltaX, 0, 3) |
+      0;
+
+}
+
+/* MI_ARB_CHECK (MI opcode 5): single-DWord command; pack writes only the
+ * header DWord (command type in bits 29:31, opcode in bits 23:28).
+ * NOTE(review): generated from genxml — do not hand-edit bit ranges. */
+#define GEN8_MI_ARB_CHECK_length_bias 0x00000001
+#define GEN8_MI_ARB_CHECK_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  5
+
+#define GEN8_MI_ARB_CHECK_length 0x00000001
+
+struct GEN8_MI_ARB_CHECK {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_ARB_CHECK_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_MI_ARB_CHECK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_BATCH_BUFFER_END (MI opcode 10): single-DWord command terminating a
+ * batch buffer; pack writes only the header DWord. */
+#define GEN8_MI_BATCH_BUFFER_END_length_bias 0x00000001
+#define GEN8_MI_BATCH_BUFFER_END_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 10
+
+#define GEN8_MI_BATCH_BUFFER_END_length 0x00000001
+
+struct GEN8_MI_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_MI_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_BATCH_BUFFER_START (MI opcode 49): 3-DWord command chaining to another
+ * batch buffer.  DWord 0 is the header/flags; DWords 1-2 hold the 64-bit
+ * start address produced by __gen_combine_address (which may also record a
+ * relocation via the opaque user data). */
+#define GEN8_MI_BATCH_BUFFER_START_length_bias 0x00000002
+#define GEN8_MI_BATCH_BUFFER_START_header       \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 49,                  \
+   .DwordLength          =  1
+
+#define GEN8_MI_BATCH_BUFFER_START_length 0x00000003
+
+struct GEN8_MI_BATCH_BUFFER_START {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     _1stlevelbatch                                     0
+#define     _2ndlevelbatch                                     1
+   uint32_t                                     _2ndLevelBatchBuffer;
+   bool                                         AddOffsetEnable;
+   uint32_t                                     PredicationEnable;
+   bool                                         ResourceStreamerEnable;
+#define     ASI_GGTT                                           0
+#define     ASI_PPGTT                                          1
+   uint32_t                                     AddressSpaceIndicator;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BatchBufferStartAddress;
+};
+
+/* Packs *values into dst (3 DWords); the address qword is split across
+ * dw[1] (low) and dw[2] (high). */
+static inline void
+GEN8_MI_BATCH_BUFFER_START_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN8_MI_BATCH_BUFFER_START * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->_2ndLevelBatchBuffer, 22, 22) |
+      __gen_field(values->AddOffsetEnable, 16, 16) |
+      __gen_field(values->PredicationEnable, 15, 15) |
+      __gen_field(values->ResourceStreamerEnable, 10, 10) |
+      __gen_field(values->AddressSpaceIndicator, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->BatchBufferStartAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+}
+
+/* MI_CLFLUSH (MI opcode 39): variable-length command (note _length is 0 and
+ * the struct ends with "variable length fields follow"); flushes cachelines
+ * starting at PageBaseAddress + StartingCachelineOffset. */
+#define GEN8_MI_CLFLUSH_length_bias 0x00000002
+#define GEN8_MI_CLFLUSH_header                  \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 39
+
+#define GEN8_MI_CLFLUSH_length 0x00000000
+
+struct GEN8_MI_CLFLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PageBaseAddress;
+   uint32_t                                     StartingCachelineOffset;
+   /* variable length fields follow */
+};
+
+/* Packs the fixed 3-DWord prefix into dst; the caller is responsible for
+ * emitting the trailing variable-length DWords. */
+static inline void
+GEN8_MI_CLFLUSH_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN8_MI_CLFLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   /* StartingCachelineOffset lives in bits 6:11 of the address DWord and is
+    * merged with the combined address below. */
+   uint32_t dw1 =
+      __gen_field(values->StartingCachelineOffset, 6, 11) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->PageBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* variable length fields follow */
+}
+
+/* MI_CONDITIONAL_BATCH_BUFFER_END (MI opcode 54): 4-DWord command comparing
+ * CompareDataDword against memory at CompareAddress to conditionally end the
+ * batch. */
+#define GEN8_MI_CONDITIONAL_BATCH_BUFFER_END_length_bias 0x00000002
+#define GEN8_MI_CONDITIONAL_BATCH_BUFFER_END_header\
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 54,                  \
+   .UseGlobalGTT         =  0,                  \
+   .CompareSemaphore     =  0,                  \
+   .DwordLength          =  1
+
+#define GEN8_MI_CONDITIONAL_BATCH_BUFFER_END_length 0x00000003
+
+struct GEN8_MI_CONDITIONAL_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     CompareSemaphore;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CompareDataDword;
+   __gen_address_type                           CompareAddress;
+};
+
+/* Packs *values into dst (4 DWords); the compare address qword is split
+ * across dw[2]/dw[3]. */
+static inline void
+GEN8_MI_CONDITIONAL_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN8_MI_CONDITIONAL_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->CompareSemaphore, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CompareDataDword, 0, 31) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->CompareAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+/* MI_COPY_MEM_MEM (MI opcode 46): 5-DWord command copying one DWord from
+ * SourceMemoryAddress to DestinationMemoryAddress.  The repeated
+ * PerProcessGraphicsAddress/GlobalGraphicsAddress #defines are identical
+ * redefinitions (benign in C) emitted once per enum-bearing field. */
+#define GEN8_MI_COPY_MEM_MEM_length_bias 0x00000002
+#define GEN8_MI_COPY_MEM_MEM_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 46,                  \
+   .DwordLength          =  3
+
+#define GEN8_MI_COPY_MEM_MEM_length 0x00000005
+
+struct GEN8_MI_COPY_MEM_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTTSource;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTTDestination;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           DestinationMemoryAddress;
+   __gen_address_type                           SourceMemoryAddress;
+};
+
+/* Packs *values into dst (5 DWords): destination address in dw[1]/dw[2],
+ * source address in dw[3]/dw[4]. */
+static inline void
+GEN8_MI_COPY_MEM_MEM_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN8_MI_COPY_MEM_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTTSource, 22, 22) |
+      __gen_field(values->UseGlobalGTTDestination, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->DestinationMemoryAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   uint32_t dw3 =
+      0;
+
+   uint64_t qw3 =
+      __gen_combine_address(data, &dw[3], values->SourceMemoryAddress, dw3);
+
+   dw[3] = qw3;
+   dw[4] = qw3 >> 32;
+
+}
+
+/* MI_LOAD_REGISTER_IMM (MI opcode 34): 3-DWord command writing an immediate
+ * DWord to an MMIO register.  RegisterOffset is DWord-aligned, hence packed
+ * with __gen_offset starting at bit 2. */
+#define GEN8_MI_LOAD_REGISTER_IMM_length_bias 0x00000002
+#define GEN8_MI_LOAD_REGISTER_IMM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 34,                  \
+   .DwordLength          =  1
+
+#define GEN8_MI_LOAD_REGISTER_IMM_length 0x00000003
+
+struct GEN8_MI_LOAD_REGISTER_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     ByteWriteDisables;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterOffset;
+   uint32_t                                     DataDWord;
+};
+
+/* Packs *values into dst (3 DWords). */
+static inline void
+GEN8_MI_LOAD_REGISTER_IMM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_MI_LOAD_REGISTER_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ByteWriteDisables, 8, 11) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterOffset, 2, 22) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord, 0, 31) |
+      0;
+
+}
+
+/* MI_LOAD_REGISTER_MEM (MI opcode 41): 4-DWord command loading an MMIO
+ * register from memory; the memory address qword is split across
+ * dw[2]/dw[3]. */
+#define GEN8_MI_LOAD_REGISTER_MEM_length_bias 0x00000002
+#define GEN8_MI_LOAD_REGISTER_MEM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 41,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_LOAD_REGISTER_MEM_length 0x00000004
+
+struct GEN8_MI_LOAD_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     AsyncModeEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into dst (4 DWords). */
+static inline void
+GEN8_MI_LOAD_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_MI_LOAD_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->AsyncModeEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+/* MI_LOAD_SCAN_LINES_EXCL (MI opcode 19): 2-DWord command programming an
+ * exclusive scan-line window for the selected display plane. */
+#define GEN8_MI_LOAD_SCAN_LINES_EXCL_length_bias 0x00000002
+#define GEN8_MI_LOAD_SCAN_LINES_EXCL_header     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 19,                  \
+   .DwordLength          =  0
+
+#define GEN8_MI_LOAD_SCAN_LINES_EXCL_length 0x00000002
+
+struct GEN8_MI_LOAD_SCAN_LINES_EXCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlaneA                                      0
+#define     DisplayPlaneB                                      1
+#define     DisplayPlaneC                                      4
+   uint32_t                                     DisplayPlaneSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+/* Packs *values into dst (2 DWords); start/end scan lines share dw[1]. */
+static inline void
+GEN8_MI_LOAD_SCAN_LINES_EXCL_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN8_MI_LOAD_SCAN_LINES_EXCL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlaneSelect, 19, 21) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StartScanLineNumber, 16, 28) |
+      __gen_field(values->EndScanLineNumber, 0, 12) |
+      0;
+
+}
+
+/* MI_LOAD_SCAN_LINES_INCL (MI opcode 18): 2-DWord command programming an
+ * inclusive scan-line window for the selected display plane.
+ * NOTE(review): ScanLineEventDoneForward is declared bool but is packed into
+ * a 2-bit field (17:18) whose declared enum includes ConditionallyForward=2,
+ * which a bool cannot represent — the field type looks like it should be
+ * uint32_t; confirm against the generator/spec. */
+#define GEN8_MI_LOAD_SCAN_LINES_INCL_length_bias 0x00000002
+#define GEN8_MI_LOAD_SCAN_LINES_INCL_header     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 18,                  \
+   .DwordLength          =  0
+
+#define GEN8_MI_LOAD_SCAN_LINES_INCL_length 0x00000002
+
+struct GEN8_MI_LOAD_SCAN_LINES_INCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlaneA                                      0
+#define     DisplayPlaneB                                      1
+#define     DisplayPlaneC                                      4
+   uint32_t                                     DisplayPlaneSelect;
+#define     NeverForward                                       0
+#define     AlwaysForward                                      1
+#define     ConditionallyForward                               2
+   bool                                         ScanLineEventDoneForward;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+/* Packs *values into dst (2 DWords). */
+static inline void
+GEN8_MI_LOAD_SCAN_LINES_INCL_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN8_MI_LOAD_SCAN_LINES_INCL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlaneSelect, 19, 21) |
+      __gen_field(values->ScanLineEventDoneForward, 17, 18) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StartScanLineNumber, 16, 28) |
+      __gen_field(values->EndScanLineNumber, 0, 12) |
+      0;
+
+}
+
+/* MI_LOAD_URB_MEM (MI opcode 44): 4-DWord command loading URB data from
+ * memory; the memory address qword is split across dw[2]/dw[3]. */
+#define GEN8_MI_LOAD_URB_MEM_length_bias 0x00000002
+#define GEN8_MI_LOAD_URB_MEM_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 44,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_LOAD_URB_MEM_length 0x00000004
+
+struct GEN8_MI_LOAD_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into dst (4 DWords). */
+static inline void
+GEN8_MI_LOAD_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN8_MI_LOAD_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+/* MI_MATH (MI opcode 26): variable-length command (_length is 0) carrying a
+ * stream of ALU instructions; only the header plus the first two ALU
+ * instruction DWords are packed here, the rest follow. */
+#define GEN8_MI_MATH_length_bias 0x00000002
+#define GEN8_MI_MATH_header                     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 26
+
+#define GEN8_MI_MATH_length 0x00000000
+
+struct GEN8_MI_MATH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ALUINSTRUCTION1;
+   uint32_t                                     ALUINSTRUCTION2;
+   /* variable length fields follow */
+};
+
+/* Packs the fixed 3-DWord prefix; caller emits remaining ALU DWords. */
+static inline void
+GEN8_MI_MATH_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN8_MI_MATH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ALUINSTRUCTION1, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ALUINSTRUCTION2, 0, 31) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* MI_NOOP (MI opcode 0): 1-DWord no-op that can optionally carry an
+ * identification number (bits 0:21) gated by a write-enable bit (22). */
+#define GEN8_MI_NOOP_length_bias 0x00000001
+#define GEN8_MI_NOOP_header                     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  0
+
+#define GEN8_MI_NOOP_length 0x00000001
+
+struct GEN8_MI_NOOP {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IdentificationNumberRegisterWriteEnable;
+   uint32_t                                     IdentificationNumber;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_NOOP_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN8_MI_NOOP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IdentificationNumberRegisterWriteEnable, 22, 22) |
+      __gen_field(values->IdentificationNumber, 0, 21) |
+      0;
+
+}
+
+/* MI_PREDICATE (MI opcode 12): 1-DWord command configuring the predicate
+ * state from load/combine/compare operations (see LOAD_*, COMBINE_*,
+ * COMPARE_* enums below). */
+#define GEN8_MI_PREDICATE_length_bias 0x00000001
+#define GEN8_MI_PREDICATE_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 12
+
+#define GEN8_MI_PREDICATE_length 0x00000001
+
+struct GEN8_MI_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     LOAD_KEEP                                          0
+#define     LOAD_LOAD                                          2
+#define     LOAD_LOADINV                                       3
+   uint32_t                                     LoadOperation;
+#define     COMBINE_SET                                        0
+#define     COMBINE_AND                                        1
+#define     COMBINE_OR                                         2
+#define     COMBINE_XOR                                        3
+   uint32_t                                     CombineOperation;
+#define     COMPARE_SRCS_EQUAL                                 2
+#define     COMPARE_DELTAS_EQUAL                               3
+   uint32_t                                     CompareOperation;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_MI_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->LoadOperation, 6, 7) |
+      __gen_field(values->CombineOperation, 3, 4) |
+      __gen_field(values->CompareOperation, 0, 1) |
+      0;
+
+}
+
+/* MI_REPORT_HEAD (MI opcode 7): 1-DWord command; pack writes only the header
+ * DWord. */
+#define GEN8_MI_REPORT_HEAD_length_bias 0x00000001
+#define GEN8_MI_REPORT_HEAD_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  7
+
+#define GEN8_MI_REPORT_HEAD_length 0x00000001
+
+struct GEN8_MI_REPORT_HEAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_REPORT_HEAD_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_MI_REPORT_HEAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+/* MI_RS_CONTEXT (MI opcode 15): 1-DWord command selecting resource-streamer
+ * context save (RS_SAVE) vs. restore (RS_RESTORE) via bit 0. */
+#define GEN8_MI_RS_CONTEXT_length_bias 0x00000001
+#define GEN8_MI_RS_CONTEXT_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 15
+
+#define GEN8_MI_RS_CONTEXT_length 0x00000001
+
+struct GEN8_MI_RS_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_RESTORE                                         0
+#define     RS_SAVE                                            1
+   uint32_t                                     ResourceStreamerSave;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_RS_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_MI_RS_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerSave, 0, 0) |
+      0;
+
+}
+
+/* MI_RS_CONTROL (MI opcode 6): 1-DWord command starting (RS_START) or
+ * stopping (RS_STOP) the resource streamer via bit 0. */
+#define GEN8_MI_RS_CONTROL_length_bias 0x00000001
+#define GEN8_MI_RS_CONTROL_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  6
+
+#define GEN8_MI_RS_CONTROL_length 0x00000001
+
+struct GEN8_MI_RS_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_STOP                                            0
+#define     RS_START                                           1
+   uint32_t                                     ResourceStreamerControl;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_RS_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_MI_RS_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerControl, 0, 0) |
+      0;
+
+}
+
+/* MI_RS_STORE_DATA_IMM (MI opcode 43): 4-DWord command storing an immediate
+ * DWord to DestinationAddress; CoreModeEnable is merged into the low bit of
+ * the address DWord. */
+#define GEN8_MI_RS_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN8_MI_RS_STORE_DATA_IMM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 43,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_RS_STORE_DATA_IMM_length 0x00000004
+
+struct GEN8_MI_RS_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           DestinationAddress;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+};
+
+/* Packs *values into dst (4 DWords); address qword in dw[1]/dw[2], data in
+ * dw[3]. */
+static inline void
+GEN8_MI_RS_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_MI_RS_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->DestinationAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+}
+
+/* MI_SET_CONTEXT (MI opcode 24): 2-DWord command binding a logical context.
+ * Context-control flags (save/restore enables, force-restore, inhibit) are
+ * merged into the low bits of the context address DWord.
+ * NOTE(review): unlike the other address-bearing packs here, the combined
+ * address result is assigned directly to the 32-bit dw[1] (no high DWord) —
+ * any bits above 31 from __gen_combine_address are dropped; confirm the
+ * context address is 32-bit by spec. */
+#define GEN8_MI_SET_CONTEXT_length_bias 0x00000002
+#define GEN8_MI_SET_CONTEXT_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 24,                  \
+   .DwordLength          =  0
+
+#define GEN8_MI_SET_CONTEXT_length 0x00000002
+
+struct GEN8_MI_SET_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           LogicalContextAddress;
+   uint32_t                                     ReservedMustbe1;
+   bool                                         CoreModeEnable;
+   bool                                         ResourceStreamerStateSaveEnable;
+   bool                                         ResourceStreamerStateRestoreEnable;
+   uint32_t                                     ForceRestore;
+   uint32_t                                     RestoreInhibit;
+};
+
+/* Packs *values into dst (2 DWords). */
+static inline void
+GEN8_MI_SET_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN8_MI_SET_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->ReservedMustbe1, 8, 8) |
+      __gen_field(values->CoreModeEnable, 4, 4) |
+      __gen_field(values->ResourceStreamerStateSaveEnable, 3, 3) |
+      __gen_field(values->ResourceStreamerStateRestoreEnable, 2, 2) |
+      __gen_field(values->ForceRestore, 1, 1) |
+      __gen_field(values->RestoreInhibit, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->LogicalContextAddress, dw1);
+
+}
+
+/* MI_SET_PREDICATE (MI opcode 1): 1-DWord command setting the 4-bit
+ * PREDICATE ENABLE mode (see the NOOP*/Execute* enum values below). */
+#define GEN8_MI_SET_PREDICATE_length_bias 0x00000001
+#define GEN8_MI_SET_PREDICATE_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  1
+
+#define GEN8_MI_SET_PREDICATE_length 0x00000001
+
+struct GEN8_MI_SET_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     NOOPNever                                          0
+#define     NOOPonResult2clear                                 1
+#define     NOOPonResult2set                                   2
+#define     NOOPonResultclear                                  3
+#define     NOOPonResultset                                    4
+#define     Executewhenonesliceenabled                         5
+#define     Executewhentwoslicesareenabled                     6
+#define     Executewhenthreeslicesareenabled                   7
+#define     NOOPAlways                                        15
+   uint32_t                                     PREDICATEENABLE;
+};
+
+/* Packs *values into dst (1 DWord). */
+static inline void
+GEN8_MI_SET_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_MI_SET_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->PREDICATEENABLE, 0, 3) |
+      0;
+
+}
+
+/* MI_STORE_DATA_IMM (MI opcode 32): stores one or two immediate DWords to
+ * Address.  NOTE(review): _length is 4 but the pack unconditionally writes
+ * dw[4] (DataDWord1); when StoreQword is used the command occupies 5 DWords,
+ * so callers must size the batch accordingly — confirm against the spec. */
+#define GEN8_MI_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN8_MI_STORE_DATA_IMM_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 32,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_STORE_DATA_IMM_length 0x00000004
+
+struct GEN8_MI_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   bool                                         StoreQword;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           Address;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+/* Packs *values into dst; address qword in dw[1]/dw[2], data in dw[3] (and
+ * dw[4]). */
+static inline void
+GEN8_MI_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_MI_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->StoreQword, 21, 21) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->Address, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+/* MI_STORE_DATA_INDEX (MI opcode 33): 3-DWord command (pack also writes
+ * dw[3] for DataDWord1) storing data at an offset into the hardware status
+ * page rather than an absolute address. */
+#define GEN8_MI_STORE_DATA_INDEX_length_bias 0x00000002
+#define GEN8_MI_STORE_DATA_INDEX_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 33,                  \
+   .DwordLength          =  1
+
+#define GEN8_MI_STORE_DATA_INDEX_length 0x00000003
+
+struct GEN8_MI_STORE_DATA_INDEX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     UsePerProcessHardwareStatusPage;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Offset;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+/* Packs *values into dst. */
+static inline void
+GEN8_MI_STORE_DATA_INDEX_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_MI_STORE_DATA_INDEX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UsePerProcessHardwareStatusPage, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Offset, 2, 11) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+/* MI_STORE_URB_MEM (MI opcode 45): 4-DWord command storing URB data to
+ * memory; mirrors MI_LOAD_URB_MEM with the transfer direction reversed. */
+#define GEN8_MI_STORE_URB_MEM_length_bias 0x00000002
+#define GEN8_MI_STORE_URB_MEM_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 45,                  \
+   .DwordLength          =  2
+
+#define GEN8_MI_STORE_URB_MEM_length 0x00000004
+
+struct GEN8_MI_STORE_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into dst (4 DWords); address qword in dw[2]/dw[3]. */
+static inline void
+GEN8_MI_STORE_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_MI_STORE_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+#define GEN8_MI_SUSPEND_FLUSH_length_bias 0x00000001
+#define GEN8_MI_SUSPEND_FLUSH_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 11
+
+#define GEN8_MI_SUSPEND_FLUSH_length 0x00000001
+
+/* Unpacked fields of the single-dword Gen8 MI_SUSPEND_FLUSH command. */
+struct GEN8_MI_SUSPEND_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         SuspendFlush;
+};
+
+/* Pack *values into dst (1 dword) at the Gen8 bit positions. */
+static inline void
+GEN8_MI_SUSPEND_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_MI_SUSPEND_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->SuspendFlush, 0, 0) |
+      0;
+
+}
+
+#define GEN8_MI_TOPOLOGY_FILTER_length_bias 0x00000001
+#define GEN8_MI_TOPOLOGY_FILTER_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 13
+
+#define GEN8_MI_TOPOLOGY_FILTER_length 0x00000001
+
+/* Unpacked fields of the single-dword Gen8 MI_TOPOLOGY_FILTER command. */
+struct GEN8_MI_TOPOLOGY_FILTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     TopologyFilterValue;
+};
+
+/* Pack *values into dst (1 dword); the filter value occupies bits 5:0. */
+static inline void
+GEN8_MI_TOPOLOGY_FILTER_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN8_MI_TOPOLOGY_FILTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->TopologyFilterValue, 0, 5) |
+      0;
+
+}
+
+#define GEN8_MI_UPDATE_GTT_length_bias 0x00000002
+#define GEN8_MI_UPDATE_GTT_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 35
+
+/* Variable-length command: _length is 0, the true size comes from
+ * DwordLength plus the trailing GTT entries.
+ */
+#define GEN8_MI_UPDATE_GTT_length 0x00000000
+
+/* Unpacked fields of the Gen8 MI_UPDATE_GTT command. */
+struct GEN8_MI_UPDATE_GTT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           EntryAddress;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed two-dword prefix of MI_UPDATE_GTT; the caller emits the
+ * variable-length entry payload afterwards.
+ *
+ * NOTE(review): unlike the other packers here, the __gen_combine_address
+ * result is stored into a single dword (dw[1]) rather than split into a
+ * qword -- confirm this matches the command's 32-bit entry-address layout.
+ */
+static inline void
+GEN8_MI_UPDATE_GTT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_MI_UPDATE_GTT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->EntryAddress, dw1);
+
+   /* variable length fields follow */
+}
+
+#define GEN8_MI_URB_ATOMIC_ALLOC_length_bias 0x00000001
+#define GEN8_MI_URB_ATOMIC_ALLOC_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  9
+
+#define GEN8_MI_URB_ATOMIC_ALLOC_length 0x00000001
+
+/* Unpacked fields of the single-dword Gen8 MI_URB_ATOMIC_ALLOC command. */
+struct GEN8_MI_URB_ATOMIC_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     URBAtomicStorageOffset;
+   uint32_t                                     URBAtomicStorageSize;
+};
+
+/* Pack *values into dst (1 dword): offset in bits 19:12, size in bits 8:0. */
+static inline void
+GEN8_MI_URB_ATOMIC_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_MI_URB_ATOMIC_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->URBAtomicStorageOffset, 12, 19) |
+      __gen_field(values->URBAtomicStorageSize, 0, 8) |
+      0;
+
+}
+
+#define GEN8_MI_URB_CLEAR_length_bias 0x00000002
+#define GEN8_MI_URB_CLEAR_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 25,                  \
+   .DwordLength          =  0
+
+#define GEN8_MI_URB_CLEAR_length 0x00000002
+
+/* Unpacked fields of the two-dword Gen8 MI_URB_CLEAR command. */
+struct GEN8_MI_URB_CLEAR {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBClearLength;
+   uint32_t                                     URBAddress;
+};
+
+/* Pack *values into dst (2 dwords); URBAddress is emitted with __gen_offset,
+ * i.e. it is already shifted/aligned rather than a raw field value.
+ */
+static inline void
+GEN8_MI_URB_CLEAR_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_MI_URB_CLEAR * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBClearLength, 16, 29) |
+      __gen_offset(values->URBAddress, 0, 14) |
+      0;
+
+}
+
+#define GEN8_MI_USER_INTERRUPT_length_bias 0x00000001
+#define GEN8_MI_USER_INTERRUPT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  2
+
+#define GEN8_MI_USER_INTERRUPT_length 0x00000001
+
+/* Unpacked fields of the single-dword Gen8 MI_USER_INTERRUPT command
+ * (header only; no payload fields).
+ */
+struct GEN8_MI_USER_INTERRUPT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Pack *values into dst (1 dword): just the command header bits. */
+static inline void
+GEN8_MI_USER_INTERRUPT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_MI_USER_INTERRUPT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN8_MI_WAIT_FOR_EVENT_length_bias 0x00000001
+#define GEN8_MI_WAIT_FOR_EVENT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  3
+
+#define GEN8_MI_WAIT_FOR_EVENT_length 0x00000001
+
+/* Unpacked fields of the single-dword Gen8 MI_WAIT_FOR_EVENT command:
+ * one enable bit per display event (vblank / flip-pending / scanline,
+ * for pipes/planes/sprites A, B and C).
+ */
+struct GEN8_MI_WAIT_FOR_EVENT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         DisplayPipeCVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteCFlipPendingWaitEnable;
+   bool                                         DisplayPlaneCFlipPendingWaitEnable;
+   bool                                         DisplayPipeCScanLineWaitEnable;
+   bool                                         DisplayPipeBVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteBFlipPendingWaitEnable;
+   bool                                         DisplayPlaneBFlipPendingWaitEnable;
+   bool                                         DisplayPipeBScanLineWaitEnable;
+   bool                                         DisplayPipeAVerticalBlankWaitEnable;
+   bool                                         DisplaySpriteAFlipPendingWaitEnable;
+   bool                                         DisplayPlaneAFlipPendingWaitEnable;
+   bool                                         DisplayPipeAScanLineWaitEnable;
+};
+
+/* Pack *values into dst (1 dword); each enable maps to a single bit. */
+static inline void
+GEN8_MI_WAIT_FOR_EVENT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_MI_WAIT_FOR_EVENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPipeCVerticalBlankWaitEnable, 21, 21) |
+      __gen_field(values->DisplaySpriteCFlipPendingWaitEnable, 20, 20) |
+      __gen_field(values->DisplayPlaneCFlipPendingWaitEnable, 15, 15) |
+      __gen_field(values->DisplayPipeCScanLineWaitEnable, 14, 14) |
+      __gen_field(values->DisplayPipeBVerticalBlankWaitEnable, 11, 11) |
+      __gen_field(values->DisplaySpriteBFlipPendingWaitEnable, 10, 10) |
+      __gen_field(values->DisplayPlaneBFlipPendingWaitEnable, 9, 9) |
+      __gen_field(values->DisplayPipeBScanLineWaitEnable, 8, 8) |
+      __gen_field(values->DisplayPipeAVerticalBlankWaitEnable, 3, 3) |
+      __gen_field(values->DisplaySpriteAFlipPendingWaitEnable, 2, 2) |
+      __gen_field(values->DisplayPlaneAFlipPendingWaitEnable, 1, 1) |
+      __gen_field(values->DisplayPipeAScanLineWaitEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN8_PIPE_CONTROL_length_bias 0x00000002
+#define GEN8_PIPE_CONTROL_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  2,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  4
+
+#define GEN8_PIPE_CONTROL_length 0x00000006
+
+/* Unpacked fields of the six-dword Gen8 PIPE_CONTROL command: the cache
+ * flush/invalidate and post-sync controls, plus the optional write-back
+ * address and immediate data used by the post-sync operation.
+ */
+struct GEN8_PIPE_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     DAT_PPGTT                                          0
+#define     DAT_GGTT                                           1
+   uint32_t                                     DestinationAddressType;
+#define     NoLRIOperation                                     0
+#define     MMIOWriteImmediateData                             1
+   uint32_t                                     LRIPostSyncOperation;
+   uint32_t                                     StoreDataIndex;
+   uint32_t                                     CommandStreamerStallEnable;
+#define     DontReset                                          0
+#define     Reset                                              1
+   uint32_t                                     GlobalSnapshotCountReset;
+   uint32_t                                     TLBInvalidate;
+   bool                                         GenericMediaStateClear;
+#define     NoWrite                                            0
+#define     WriteImmediateData                                 1
+#define     WritePSDepthCount                                  2
+#define     WriteTimestamp                                     3
+   uint32_t                                     PostSyncOperation;
+   bool                                         DepthStallEnable;
+#define     DisableFlush                                       0
+#define     EnableFlush                                        1
+   bool                                         RenderTargetCacheFlushEnable;
+   bool                                         InstructionCacheInvalidateEnable;
+   bool                                         TextureCacheInvalidationEnable;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         NotifyEnable;
+   bool                                         PipeControlFlushEnable;
+   bool                                         DCFlushEnable;
+   bool                                         VFCacheInvalidationEnable;
+   bool                                         ConstantCacheInvalidationEnable;
+   bool                                         StateCacheInvalidationEnable;
+   bool                                         StallAtPixelScoreboard;
+#define     FlushDisabled                                      0
+#define     FlushEnabled                                       1
+   bool                                         DepthCacheFlushEnable;
+   __gen_address_type                           Address;
+   uint64_t                                     ImmediateData;
+};
+
+/* Pack *values into dst (6 dwords). dw[1] holds all flush/invalidate and
+ * post-sync control bits; the relocated Address fills dw[2]/dw[3] and the
+ * 64-bit ImmediateData fills dw[4]/dw[5].
+ */
+static inline void
+GEN8_PIPE_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_PIPE_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DestinationAddressType, 24, 24) |
+      __gen_field(values->LRIPostSyncOperation, 23, 23) |
+      __gen_field(values->StoreDataIndex, 21, 21) |
+      __gen_field(values->CommandStreamerStallEnable, 20, 20) |
+      __gen_field(values->GlobalSnapshotCountReset, 19, 19) |
+      __gen_field(values->TLBInvalidate, 18, 18) |
+      __gen_field(values->GenericMediaStateClear, 16, 16) |
+      __gen_field(values->PostSyncOperation, 14, 15) |
+      __gen_field(values->DepthStallEnable, 13, 13) |
+      __gen_field(values->RenderTargetCacheFlushEnable, 12, 12) |
+      __gen_field(values->InstructionCacheInvalidateEnable, 11, 11) |
+      __gen_field(values->TextureCacheInvalidationEnable, 10, 10) |
+      __gen_field(values->IndirectStatePointersDisable, 9, 9) |
+      __gen_field(values->NotifyEnable, 8, 8) |
+      __gen_field(values->PipeControlFlushEnable, 7, 7) |
+      __gen_field(values->DCFlushEnable, 5, 5) |
+      __gen_field(values->VFCacheInvalidationEnable, 4, 4) |
+      __gen_field(values->ConstantCacheInvalidationEnable, 3, 3) |
+      __gen_field(values->StateCacheInvalidationEnable, 2, 2) |
+      __gen_field(values->StallAtPixelScoreboard, 1, 1) |
+      __gen_field(values->DepthCacheFlushEnable, 0, 0) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->Address, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   uint64_t qw4 =
+      __gen_field(values->ImmediateData, 0, 63) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+}
+
+#define GEN8_SCISSOR_RECT_length 0x00000002
+
+/* Unpacked fields of the two-dword Gen8 SCISSOR_RECT state structure. */
+struct GEN8_SCISSOR_RECT {
+   uint32_t                                     ScissorRectangleYMin;
+   uint32_t                                     ScissorRectangleXMin;
+   uint32_t                                     ScissorRectangleYMax;
+   uint32_t                                     ScissorRectangleXMax;
+};
+
+/* Pack *values into dst (2 dwords): Y in the high 16 bits, X in the low. */
+static inline void
+GEN8_SCISSOR_RECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN8_SCISSOR_RECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ScissorRectangleYMin, 16, 31) |
+      __gen_field(values->ScissorRectangleXMin, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ScissorRectangleYMax, 16, 31) |
+      __gen_field(values->ScissorRectangleXMax, 0, 15) |
+      0;
+
+}
+
+#define GEN8_SF_CLIP_VIEWPORT_length 0x00000010
+
+/* Unpacked fields of the sixteen-dword Gen8 SF_CLIP_VIEWPORT state:
+ * viewport transform matrix elements, clip guardband extents, and the
+ * viewport X/Y extents.
+ */
+struct GEN8_SF_CLIP_VIEWPORT {
+   float                                        ViewportMatrixElementm00;
+   float                                        ViewportMatrixElementm11;
+   float                                        ViewportMatrixElementm22;
+   float                                        ViewportMatrixElementm30;
+   float                                        ViewportMatrixElementm31;
+   float                                        ViewportMatrixElementm32;
+   float                                        XMinClipGuardband;
+   float                                        XMaxClipGuardband;
+   float                                        YMinClipGuardband;
+   float                                        YMaxClipGuardband;
+   float                                        XMinViewPort;
+   float                                        XMaxViewPort;
+   float                                        YMinViewPort;
+   float                                        YMaxViewPort;
+};
+
+/* Pack *values into dst (16 dwords); dw[6] and dw[7] are reserved and
+ * written as zero.
+ */
+static inline void
+GEN8_SF_CLIP_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_SF_CLIP_VIEWPORT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->ViewportMatrixElementm00) |
+      0;
+
+   dw[1] =
+      __gen_float(values->ViewportMatrixElementm11) |
+      0;
+
+   dw[2] =
+      __gen_float(values->ViewportMatrixElementm22) |
+      0;
+
+   dw[3] =
+      __gen_float(values->ViewportMatrixElementm30) |
+      0;
+
+   dw[4] =
+      __gen_float(values->ViewportMatrixElementm31) |
+      0;
+
+   dw[5] =
+      __gen_float(values->ViewportMatrixElementm32) |
+      0;
+
+   dw[6] =
+      0;
+
+   dw[7] =
+      0;
+
+   dw[8] =
+      __gen_float(values->XMinClipGuardband) |
+      0;
+
+   dw[9] =
+      __gen_float(values->XMaxClipGuardband) |
+      0;
+
+   dw[10] =
+      __gen_float(values->YMinClipGuardband) |
+      0;
+
+   dw[11] =
+      __gen_float(values->YMaxClipGuardband) |
+      0;
+
+   dw[12] =
+      __gen_float(values->XMinViewPort) |
+      0;
+
+   dw[13] =
+      __gen_float(values->XMaxViewPort) |
+      0;
+
+   dw[14] =
+      __gen_float(values->YMinViewPort) |
+      0;
+
+   dw[15] =
+      __gen_float(values->YMaxViewPort) |
+      0;
+
+}
+
+#define GEN8_BLEND_STATE_length 0x00000011
+
+#define GEN8_BLEND_STATE_ENTRY_length 0x00000002
+
+/* Unpacked fields of one per-render-target BLEND_STATE entry (2 dwords). */
+struct GEN8_BLEND_STATE_ENTRY {
+   bool                                         LogicOpEnable;
+   uint32_t                                     LogicOpFunction;
+   uint32_t                                     PreBlendSourceOnlyClampEnable;
+#define     COLORCLAMP_UNORM                                   0
+#define     COLORCLAMP_SNORM                                   1
+#define     COLORCLAMP_RTFORMAT                                2
+   uint32_t                                     ColorClampRange;
+   bool                                         PreBlendColorClampEnable;
+   bool                                         PostBlendColorClampEnable;
+   bool                                         ColorBufferBlendEnable;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   uint32_t                                     ColorBlendFunction;
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+   uint32_t                                     AlphaBlendFunction;
+   bool                                         WriteDisableAlpha;
+   bool                                         WriteDisableRed;
+   bool                                         WriteDisableGreen;
+   bool                                         WriteDisableBlue;
+};
+
+/* Pack one entry into dst: the fields are assembled as a single 64-bit
+ * value (bit positions span both dwords) and then split into dw[0]/dw[1].
+ */
+static inline void
+GEN8_BLEND_STATE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN8_BLEND_STATE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint64_t qw0 =
+      __gen_field(values->LogicOpEnable, 63, 63) |
+      __gen_field(values->LogicOpFunction, 59, 62) |
+      __gen_field(values->PreBlendSourceOnlyClampEnable, 36, 36) |
+      __gen_field(values->ColorClampRange, 34, 35) |
+      __gen_field(values->PreBlendColorClampEnable, 33, 33) |
+      __gen_field(values->PostBlendColorClampEnable, 32, 32) |
+      __gen_field(values->ColorBufferBlendEnable, 31, 31) |
+      __gen_field(values->SourceBlendFactor, 26, 30) |
+      __gen_field(values->DestinationBlendFactor, 21, 25) |
+      __gen_field(values->ColorBlendFunction, 18, 20) |
+      __gen_field(values->SourceAlphaBlendFactor, 13, 17) |
+      __gen_field(values->DestinationAlphaBlendFactor, 8, 12) |
+      __gen_field(values->AlphaBlendFunction, 5, 7) |
+      __gen_field(values->WriteDisableAlpha, 3, 3) |
+      __gen_field(values->WriteDisableRed, 2, 2) |
+      __gen_field(values->WriteDisableGreen, 1, 1) |
+      __gen_field(values->WriteDisableBlue, 0, 0) |
+      0;
+
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+
+/* Unpacked fields of the Gen8 BLEND_STATE structure: one shared control
+ * dword followed by eight per-render-target entries.
+ */
+struct GEN8_BLEND_STATE {
+   bool                                         AlphaToCoverageEnable;
+   bool                                         IndependentAlphaBlendEnable;
+   bool                                         AlphaToOneEnable;
+   bool                                         AlphaToCoverageDitherEnable;
+   bool                                         AlphaTestEnable;
+   uint32_t                                     AlphaTestFunction;
+   bool                                         ColorDitherEnable;
+   uint32_t                                     XDitherOffset;
+   uint32_t                                     YDitherOffset;
+   struct GEN8_BLEND_STATE_ENTRY                Entry[8];
+};
+
+/* Pack *values into dst: dw[0] holds the shared controls, then each of the
+ * 8 entries is packed at 2-dword stride -- 1 + 8*2 = 17 dwords total,
+ * matching GEN8_BLEND_STATE_length (0x11).
+ */
+static inline void
+GEN8_BLEND_STATE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN8_BLEND_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->AlphaToCoverageEnable, 31, 31) |
+      __gen_field(values->IndependentAlphaBlendEnable, 30, 30) |
+      __gen_field(values->AlphaToOneEnable, 29, 29) |
+      __gen_field(values->AlphaToCoverageDitherEnable, 28, 28) |
+      __gen_field(values->AlphaTestEnable, 27, 27) |
+      __gen_field(values->AlphaTestFunction, 24, 26) |
+      __gen_field(values->ColorDitherEnable, 23, 23) |
+      __gen_field(values->XDitherOffset, 21, 22) |
+      __gen_field(values->YDitherOffset, 19, 20) |
+      0;
+
+   for (uint32_t i = 0, j = 1; i < 8; i++, j += 2)
+      GEN8_BLEND_STATE_ENTRY_pack(data, &dw[j], &values->Entry[i]);
+}
+
+#define GEN8_CC_VIEWPORT_length 0x00000002
+
+/* Unpacked fields of the two-dword Gen8 CC_VIEWPORT state (depth range). */
+struct GEN8_CC_VIEWPORT {
+   float                                        MinimumDepth;
+   float                                        MaximumDepth;
+};
+
+/* Pack *values into dst (2 dwords) as raw IEEE-754 float bits. */
+static inline void
+GEN8_CC_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN8_CC_VIEWPORT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->MinimumDepth) |
+      0;
+
+   dw[1] =
+      __gen_float(values->MaximumDepth) |
+      0;
+
+}
+
+#define GEN8_COLOR_CALC_STATE_length 0x00000006
+
+/* Unpacked fields of the six-dword Gen8 COLOR_CALC_STATE structure. */
+struct GEN8_COLOR_CALC_STATE {
+   uint32_t                                     StencilReferenceValue;
+   uint32_t                                     BackFaceStencilReferenceValue;
+#define     Cancelled                                          0
+#define     NotCancelled                                       1
+   uint32_t                                     RoundDisableFunctionDisable;
+#define     ALPHATEST_UNORM8                                   0
+#define     ALPHATEST_FLOAT32                                  1
+   uint32_t                                     AlphaTestFormat;
+   uint32_t                                     AlphaReferenceValueAsUNORM8;
+   float                                        AlphaReferenceValueAsFLOAT32;
+   float                                        BlendConstantColorRed;
+   float                                        BlendConstantColorGreen;
+   float                                        BlendConstantColorBlue;
+   float                                        BlendConstantColorAlpha;
+};
+
+/* Pack *values into dst (6 dwords).
+ *
+ * dw[1] OR's both alpha-reference representations into the same dword;
+ * the caller must leave the representation not selected by
+ * AlphaTestFormat zero-initialized, or the two encodings will be merged.
+ */
+static inline void
+GEN8_COLOR_CALC_STATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN8_COLOR_CALC_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->StencilReferenceValue, 24, 31) |
+      __gen_field(values->BackFaceStencilReferenceValue, 16, 23) |
+      __gen_field(values->RoundDisableFunctionDisable, 15, 15) |
+      __gen_field(values->AlphaTestFormat, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AlphaReferenceValueAsUNORM8, 0, 31) |
+      __gen_float(values->AlphaReferenceValueAsFLOAT32) |
+      0;
+
+   dw[2] =
+      __gen_float(values->BlendConstantColorRed) |
+      0;
+
+   dw[3] =
+      __gen_float(values->BlendConstantColorGreen) |
+      0;
+
+   dw[4] =
+      __gen_float(values->BlendConstantColorBlue) |
+      0;
+
+   dw[5] =
+      __gen_float(values->BlendConstantColorAlpha) |
+      0;
+
+}
+
+#define GEN8_BLACK_LEVEL_CORRECTION_STATE__DW7576_length 0x00000002
+
+/* Unpacked per-channel black-point offsets (R/G/B) for the black level
+ * correction state, dwords 75-76.
+ */
+struct GEN8_BLACK_LEVEL_CORRECTION_STATE__DW7576 {
+   uint32_t                                     BlackPointOffsetR;
+   uint32_t                                     BlackPointOffsetG;
+   uint32_t                                     BlackPointOffsetB;
+};
+
+/* Pack *values into dst (2 dwords): R alone in dw[0], G and B in dw[1]. */
+static inline void
+GEN8_BLACK_LEVEL_CORRECTION_STATE__DW7576_pack(__gen_user_data *data, void * restrict dst,
+                                               const struct GEN8_BLACK_LEVEL_CORRECTION_STATE__DW7576 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BlackPointOffsetR, 0, 12) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BlackPointOffsetG, 13, 25) |
+      __gen_field(values->BlackPointOffsetB, 0, 12) |
+      0;
+
+}
+
+#define GEN8_INTERFACE_DESCRIPTOR_DATA_length 0x00000008
+
+/* Unpacked fields of the eight-dword Gen8 INTERFACE_DESCRIPTOR_DATA
+ * structure used to launch compute/media kernels.
+ */
+struct GEN8_INTERFACE_DESCRIPTOR_DATA {
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     KernelStartPointerHigh;
+#define     Ftz                                                0
+#define     SetByKernel                                        1
+   uint32_t                                     DenormMode;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     SamplerStatePointer;
+#define     Nosamplersused                                     0
+#define     Between1and4samplersused                           1
+#define     Between5and8samplersused                           2
+#define     Between9and12samplersused                          3
+#define     Between13and16samplersused                         4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTablePointer;
+   uint32_t                                     BindingTableEntryCount;
+   uint32_t                                     ConstantIndirectURBEntryReadLength;
+   uint32_t                                     ConstantURBEntryReadOffset;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         BarrierEnable;
+#define     Encodes0k                                          0
+#define     Encodes4k                                          1
+#define     Encodes8k                                          2
+#define     Encodes16k                                         4
+#define     Encodes32k                                         8
+#define     Encodes64k                                        16
+   uint32_t                                     SharedLocalMemorySize;
+   uint32_t                                     NumberofThreadsinGPGPUThreadGroup;
+   uint32_t                                     CrossThreadConstantDataReadLength;
+};
+
+/* Pack *values into dst (8 dwords). Pointer fields use __gen_offset
+ * (pre-aligned values); scalar controls use __gen_field.
+ */
+static inline void
+GEN8_INTERFACE_DESCRIPTOR_DATA_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN8_INTERFACE_DESCRIPTOR_DATA * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointerHigh, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DenormMode, 19, 19) |
+      __gen_field(values->SingleProgramFlow, 18, 18) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->SamplerStatePointer, 5, 31) |
+      __gen_field(values->SamplerCount, 2, 4) |
+      0;
+
+   dw[4] =
+      __gen_offset(values->BindingTablePointer, 5, 15) |
+      __gen_field(values->BindingTableEntryCount, 0, 4) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ConstantIndirectURBEntryReadLength, 16, 31) |
+      __gen_field(values->ConstantURBEntryReadOffset, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->RoundingMode, 22, 23) |
+      __gen_field(values->BarrierEnable, 21, 21) |
+      __gen_field(values->SharedLocalMemorySize, 16, 20) |
+      __gen_field(values->NumberofThreadsinGPGPUThreadGroup, 0, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->CrossThreadConstantDataReadLength, 0, 7) |
+      0;
+
+}
+
+#define GEN8_BINDING_TABLE_STATE_length 0x00000001
+
+/* Unpacked field of one single-dword Gen8 binding table entry. */
+struct GEN8_BINDING_TABLE_STATE {
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Pack *values into dst (1 dword); the pointer is pre-aligned, hence
+ * __gen_offset rather than __gen_field.
+ */
+static inline void
+GEN8_BINDING_TABLE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN8_BINDING_TABLE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->SurfaceStatePointer, 6, 31) |
+      0;
+
+}
+
+/* Unpacked template for a RENDER_SURFACE_STATE structure (16 dwords when
+ * packed).  Each #define group immediately preceding a field lists the legal
+ * values for that field.
+ */
+#define GEN8_RENDER_SURFACE_STATE_length 0x00000010
+
+struct GEN8_RENDER_SURFACE_STATE {
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_BUFFER                                    4
+#define     SURFTYPE_STRBUF                                    5
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         SurfaceArray;
+   uint32_t                                     SurfaceFormat;
+#define     VALIGN4                                            1
+#define     VALIGN8                                            2
+#define     VALIGN16                                           3
+   uint32_t                                     SurfaceVerticalAlignment;
+#define     HALIGN4                                            1
+#define     HALIGN8                                            2
+#define     HALIGN16                                           3
+   uint32_t                                     SurfaceHorizontalAlignment;
+#define     LINEAR                                             0
+#define     WMAJOR                                             1
+#define     XMAJOR                                             2
+#define     YMAJOR                                             3
+   uint32_t                                     TileMode;
+   uint32_t                                     VerticalLineStride;
+   uint32_t                                     VerticalLineStrideOffset;
+   bool                                         SamplerL2BypassModeDisable;
+#define     WriteOnlyCache                                     0
+#define     ReadWriteCache                                     1
+   uint32_t                                     RenderCacheReadWriteMode;
+#define     NORMAL_MODE                                        0
+#define     PROGRESSIVE_FRAME                                  2
+#define     INTERLACED_FRAME                                   3
+   uint32_t                                     MediaBoundaryPixelMode;
+   bool                                         CubeFaceEnablePositiveZ;
+   bool                                         CubeFaceEnableNegativeZ;
+   bool                                         CubeFaceEnablePositiveY;
+   bool                                         CubeFaceEnableNegativeY;
+   bool                                         CubeFaceEnablePositiveX;
+   bool                                         CubeFaceEnableNegativeX;
+   struct GEN8_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   /* Packed as unsigned fixed point with 1 fractional bit (see _pack). */
+   float                                        BaseMipLevel;
+   uint32_t                                     SurfaceQPitch;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     Depth;
+   uint32_t                                     SurfacePitch;
+#define     _0DEG                                              0
+#define     _90DEG                                             1
+#define     _270DEG                                            3
+   uint32_t                                     RenderTargetAndSampleUnormRotation;
+   uint32_t                                     MinimumArrayElement;
+   uint32_t                                     RenderTargetViewExtent;
+#define     MSS                                                0
+#define     DEPTH_STENCIL                                      1
+   uint32_t                                     MultisampledSurfaceStorageFormat;
+#define     MULTISAMPLECOUNT_1                                 0
+#define     MULTISAMPLECOUNT_2                                 1
+#define     MULTISAMPLECOUNT_4                                 2
+#define     MULTISAMPLECOUNT_8                                 3
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     MultisamplePositionPaletteIndex;
+   uint32_t                                     XOffset;
+   uint32_t                                     YOffset;
+   bool                                         EWADisableForCube;
+#define     GPUcoherent                                        0
+#define     IAcoherent                                         1
+   uint32_t                                     CoherencyType;
+   uint32_t                                     SurfaceMinLOD;
+   uint32_t                                     MIPCountLOD;
+   uint32_t                                     AuxiliarySurfaceQPitch;
+   uint32_t                                     AuxiliarySurfacePitch;
+#define     AUX_NONE                                           0
+#define     AUX_MCS                                            1
+#define     AUX_APPEND                                         2
+#define     AUX_HIZ                                            3
+   uint32_t                                     AuxiliarySurfaceMode;
+   bool                                         SeparateUVPlaneEnable;
+   uint32_t                                     XOffsetforUorUVPlane;
+   uint32_t                                     YOffsetforUorUVPlane;
+   uint32_t                                     RedClearColor;
+   uint32_t                                     GreenClearColor;
+   uint32_t                                     BlueClearColor;
+   uint32_t                                     AlphaClearColor;
+   uint32_t                                     ShaderChannelSelectRed;
+   uint32_t                                     ShaderChannelSelectGreen;
+   uint32_t                                     ShaderChannelSelectBlue;
+   uint32_t                                     ShaderChannelSelectAlpha;
+   /* Packed as unsigned fixed point with 8 fractional bits (see _pack). */
+   float                                        ResourceMinLOD;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     XOffsetforVPlane;
+   uint32_t                                     YOffsetforVPlane;
+   uint32_t                                     AuxiliaryTableIndexforMediaCompressedSurface;
+   __gen_address_type                           AuxiliarySurfaceBaseAddress;
+};
+
+/* Packs *values into dst as 16 little-endian dwords.  Each dword is built by
+ * ORing the fields shifted into their (start, end) bit ranges; the two base
+ * addresses are emitted as 64-bit quantities via __gen_combine_address so the
+ * caller's relocation machinery can see them.  dwords 12..15 are written as
+ * zero.
+ */
+static inline void
+GEN8_RENDER_SURFACE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN8_RENDER_SURFACE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->SurfaceArray, 28, 28) |
+      __gen_field(values->SurfaceFormat, 18, 26) |
+      __gen_field(values->SurfaceVerticalAlignment, 16, 17) |
+      __gen_field(values->SurfaceHorizontalAlignment, 14, 15) |
+      __gen_field(values->TileMode, 12, 13) |
+      __gen_field(values->VerticalLineStride, 11, 11) |
+      __gen_field(values->VerticalLineStrideOffset, 10, 10) |
+      __gen_field(values->SamplerL2BypassModeDisable, 9, 9) |
+      __gen_field(values->RenderCacheReadWriteMode, 8, 8) |
+      __gen_field(values->MediaBoundaryPixelMode, 6, 7) |
+      __gen_field(values->CubeFaceEnablePositiveZ, 0, 0) |
+      __gen_field(values->CubeFaceEnableNegativeZ, 1, 1) |
+      __gen_field(values->CubeFaceEnablePositiveY, 2, 2) |
+      __gen_field(values->CubeFaceEnableNegativeY, 3, 3) |
+      __gen_field(values->CubeFaceEnablePositiveX, 4, 4) |
+      __gen_field(values->CubeFaceEnableNegativeX, 5, 5) |
+      0;
+
+   /* MOCS is packed separately, then spliced into bits 24..30 of dword 1. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN8_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[1] =
+      __gen_field(dw_MemoryObjectControlState, 24, 30) |
+      __gen_field(values->BaseMipLevel * (1 << 1), 19, 23) |
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Height, 16, 29) |
+      __gen_field(values->Width, 0, 13) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   dw[4] =
+      __gen_field(values->RenderTargetAndSampleUnormRotation, 29, 30) |
+      __gen_field(values->MinimumArrayElement, 18, 28) |
+      __gen_field(values->RenderTargetViewExtent, 7, 17) |
+      __gen_field(values->MultisampledSurfaceStorageFormat, 6, 6) |
+      __gen_field(values->NumberofMultisamples, 3, 5) |
+      __gen_field(values->MultisamplePositionPaletteIndex, 0, 2) |
+      0;
+
+   dw[5] =
+      __gen_offset(values->XOffset, 25, 31) |
+      __gen_offset(values->YOffset, 21, 23) |
+      __gen_field(values->EWADisableForCube, 20, 20) |
+      __gen_field(values->CoherencyType, 14, 14) |
+      __gen_field(values->SurfaceMinLOD, 4, 7) |
+      __gen_field(values->MIPCountLOD, 0, 3) |
+      0;
+
+   /* NOTE(review): the auxiliary-surface fields and the planar (UV-plane)
+    * fields below occupy overlapping bit ranges in dword 6; presumably only
+    * one group is meaningful for a given surface format -- confirm against
+    * the PRM before relying on both at once.
+    */
+   dw[6] =
+      __gen_field(values->AuxiliarySurfaceQPitch, 16, 30) |
+      __gen_field(values->AuxiliarySurfacePitch, 3, 11) |
+      __gen_field(values->AuxiliarySurfaceMode, 0, 2) |
+      __gen_field(values->SeparateUVPlaneEnable, 31, 31) |
+      __gen_field(values->XOffsetforUorUVPlane, 16, 29) |
+      __gen_field(values->YOffsetforUorUVPlane, 0, 13) |
+      0;
+
+   dw[7] =
+      __gen_field(values->RedClearColor, 31, 31) |
+      __gen_field(values->GreenClearColor, 30, 30) |
+      __gen_field(values->BlueClearColor, 29, 29) |
+      __gen_field(values->AlphaClearColor, 28, 28) |
+      __gen_field(values->ShaderChannelSelectRed, 25, 27) |
+      __gen_field(values->ShaderChannelSelectGreen, 22, 24) |
+      __gen_field(values->ShaderChannelSelectBlue, 19, 21) |
+      __gen_field(values->ShaderChannelSelectAlpha, 16, 18) |
+      __gen_field(values->ResourceMinLOD * (1 << 8), 0, 11) |
+      0;
+
+   /* 64-bit SurfaceBaseAddress occupies dwords 8..9. */
+   uint32_t dw8 =
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->SurfaceBaseAddress, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   /* NOTE(review): bit positions 48..61 and 32..45 exceed 31, yet dw10 is a
+    * uint32_t, so the V-plane offsets appear to be truncated away before
+    * __gen_combine_address folds dw10 into the 64-bit address -- verify
+    * whether this matches the generator's intent.
+    */
+   uint32_t dw10 =
+      __gen_field(values->XOffsetforVPlane, 48, 61) |
+      __gen_field(values->YOffsetforVPlane, 32, 45) |
+      __gen_field(values->AuxiliaryTableIndexforMediaCompressedSurface, 21, 31) |
+      0;
+
+   uint64_t qw10 =
+      __gen_combine_address(data, &dw[10], values->AuxiliarySurfaceBaseAddress, dw10);
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+   dw[12] =
+      0;
+
+   dw[13] =
+      0;
+
+   dw[14] =
+      0;
+
+   dw[15] =
+      0;
+
+}
+
+/* One 8-bit filter coefficient packed into the low byte of a single dword. */
+#define GEN8_FILTER_COEFFICIENT_length 0x00000001
+
+struct GEN8_FILTER_COEFFICIENT {
+   uint32_t                                     FilterCoefficient;
+};
+
+/* Packs *values into dst as one little-endian dword (bits 0..7 used). */
+static inline void
+GEN8_FILTER_COEFFICIENT_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN8_FILTER_COEFFICIENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->FilterCoefficient, 0, 7) |
+      0;
+
+}
+
+/* Unpacked template for a SAMPLER_STATE structure (4 dwords when packed).
+ * The #define group immediately preceding a field lists its legal values.
+ */
+#define GEN8_SAMPLER_STATE_length 0x00000004
+
+struct GEN8_SAMPLER_STATE {
+   bool                                         SamplerDisable;
+#define     DX10OGL                                            0
+#define     DX9                                                1
+   uint32_t                                     TextureBorderColorMode;
+#define     CLAMP_NONE                                         0
+#define     CLAMP_OGL                                          2
+   uint32_t                                     LODPreClampMode;
+   /* Packed as unsigned fixed point with 1 fractional bit (see _pack). */
+   float                                        BaseMipLevel;
+#define     MIPFILTER_NONE                                     0
+#define     MIPFILTER_NEAREST                                  1
+#define     MIPFILTER_LINEAR                                   3
+   uint32_t                                     MipModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MagModeFilter;
+/* Same MAPFILTER_* values repeated for the min filter; identical
+ * redefinition of an object-like macro is benign.
+ */
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MinModeFilter;
+   /* Packed as signed fixed point, 13 bits with 8 fractional (see _pack). */
+   float                                        TextureLODBias;
+#define     LEGACY                                             0
+#define     EWAApproximation                                   1
+   uint32_t                                     AnisotropicAlgorithm;
+   /* MinLOD/MaxLOD are packed as unsigned 4.8 fixed point (see _pack). */
+   float                                        MinLOD;
+   float                                        MaxLOD;
+   bool                                         ChromaKeyEnable;
+   uint32_t                                     ChromaKeyIndex;
+#define     KEYFILTER_KILL_ON_ANY_MATCH                        0
+#define     KEYFILTER_REPLACE_BLACK                            1
+   uint32_t                                     ChromaKeyMode;
+#define     PREFILTEROPALWAYS                                  0
+#define     PREFILTEROPNEVER                                   1
+#define     PREFILTEROPLESS                                    2
+#define     PREFILTEROPEQUAL                                   3
+#define     PREFILTEROPLEQUAL                                  4
+#define     PREFILTEROPGREATER                                 5
+#define     PREFILTEROPNOTEQUAL                                6
+#define     PREFILTEROPGEQUAL                                  7
+   uint32_t                                     ShadowFunction;
+#define     PROGRAMMED                                         0
+#define     OVERRIDE                                           1
+   uint32_t                                     CubeSurfaceControlMode;
+   uint32_t                                     IndirectStatePointer;
+#define     MIPNONE                                            0
+#define     MIPFILTER                                          1
+   uint32_t                                     LODClampMagnificationMode;
+#define     RATIO21                                            0
+#define     RATIO41                                            1
+#define     RATIO61                                            2
+#define     RATIO81                                            3
+#define     RATIO101                                           4
+#define     RATIO121                                           5
+#define     RATIO141                                           6
+#define     RATIO161                                           7
+   uint32_t                                     MaximumAnisotropy;
+   bool                                         RAddressMinFilterRoundingEnable;
+   bool                                         RAddressMagFilterRoundingEnable;
+   bool                                         VAddressMinFilterRoundingEnable;
+   bool                                         VAddressMagFilterRoundingEnable;
+   bool                                         UAddressMinFilterRoundingEnable;
+   bool                                         UAddressMagFilterRoundingEnable;
+#define     FULL                                               0
+#define     HIGH                                               1
+#define     MED                                                2
+#define     LOW                                                3
+   uint32_t                                     TrilinearFilterQuality;
+   bool                                         NonnormalizedCoordinateEnable;
+   uint32_t                                     TCXAddressControlMode;
+   uint32_t                                     TCYAddressControlMode;
+   uint32_t                                     TCZAddressControlMode;
+};
+
+/* Packs *values into dst as 4 little-endian dwords.  Float LOD fields are
+ * converted to fixed point here: BaseMipLevel via * (1 << 1) (1 fractional
+ * bit), MinLOD/MaxLOD via * (1 << 8) (8 fractional bits), and TextureLODBias
+ * via __gen_fixed with sign extension and 8 fractional bits.
+ */
+static inline void
+GEN8_SAMPLER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN8_SAMPLER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SamplerDisable, 31, 31) |
+      __gen_field(values->TextureBorderColorMode, 29, 29) |
+      __gen_field(values->LODPreClampMode, 27, 28) |
+      __gen_field(values->BaseMipLevel * (1 << 1), 22, 26) |
+      __gen_field(values->MipModeFilter, 20, 21) |
+      __gen_field(values->MagModeFilter, 17, 19) |
+      __gen_field(values->MinModeFilter, 14, 16) |
+      __gen_fixed(values->TextureLODBias, 1, 13, true, 8) |
+      __gen_field(values->AnisotropicAlgorithm, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MinLOD * (1 << 8), 20, 31) |
+      __gen_field(values->MaxLOD * (1 << 8), 8, 19) |
+      __gen_field(values->ChromaKeyEnable, 7, 7) |
+      __gen_field(values->ChromaKeyIndex, 5, 6) |
+      __gen_field(values->ChromaKeyMode, 4, 4) |
+      __gen_field(values->ShadowFunction, 1, 3) |
+      __gen_field(values->CubeSurfaceControlMode, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->IndirectStatePointer, 6, 23) |
+      __gen_field(values->LODClampMagnificationMode, 0, 0) |
+      0;
+
+   dw[3] =
+      __gen_field(values->MaximumAnisotropy, 19, 21) |
+      __gen_field(values->RAddressMinFilterRoundingEnable, 13, 13) |
+      __gen_field(values->RAddressMagFilterRoundingEnable, 14, 14) |
+      __gen_field(values->VAddressMinFilterRoundingEnable, 15, 15) |
+      __gen_field(values->VAddressMagFilterRoundingEnable, 16, 16) |
+      __gen_field(values->UAddressMinFilterRoundingEnable, 17, 17) |
+      __gen_field(values->UAddressMagFilterRoundingEnable, 18, 18) |
+      __gen_field(values->TrilinearFilterQuality, 11, 12) |
+      __gen_field(values->NonnormalizedCoordinateEnable, 10, 10) |
+      __gen_field(values->TCXAddressControlMode, 6, 8) |
+      __gen_field(values->TCYAddressControlMode, 3, 5) |
+      __gen_field(values->TCZAddressControlMode, 0, 2) |
+      0;
+
+}
+
+/* 8x8 adaptive-video-scaler filter coefficients (8 dwords when packed).
+ * Table 0 carries X and Y coefficients n0..n7; for Table 1 only n2..n5 are
+ * present here -- presumably the remaining taps live in another structure or
+ * are implied; confirm against the PRM.
+ */
+#define GEN8_SAMPLER_STATE_8X8_AVS_COEFFICIENTS_length 0x00000008
+
+struct GEN8_SAMPLER_STATE_8X8_AVS_COEFFICIENTS {
+   uint32_t                                     Table0YFilterCoefficientn1;
+   uint32_t                                     Table0XFilterCoefficientn1;
+   uint32_t                                     Table0YFilterCoefficientn0;
+   uint32_t                                     Table0XFilterCoefficientn0;
+   uint32_t                                     Table0YFilterCoefficientn3;
+   uint32_t                                     Table0XFilterCoefficientn3;
+   uint32_t                                     Table0YFilterCoefficientn2;
+   uint32_t                                     Table0XFilterCoefficientn2;
+   uint32_t                                     Table0YFilterCoefficientn5;
+   uint32_t                                     Table0XFilterCoefficientn5;
+   uint32_t                                     Table0YFilterCoefficientn4;
+   uint32_t                                     Table0XFilterCoefficientn4;
+   uint32_t                                     Table0YFilterCoefficientn7;
+   uint32_t                                     Table0XFilterCoefficientn7;
+   uint32_t                                     Table0YFilterCoefficientn6;
+   uint32_t                                     Table0XFilterCoefficientn6;
+   uint32_t                                     Table1XFilterCoefficientn3;
+   uint32_t                                     Table1XFilterCoefficientn2;
+   uint32_t                                     Table1XFilterCoefficientn5;
+   uint32_t                                     Table1XFilterCoefficientn4;
+   uint32_t                                     Table1YFilterCoefficientn3;
+   uint32_t                                     Table1YFilterCoefficientn2;
+   uint32_t                                     Table1YFilterCoefficientn5;
+   uint32_t                                     Table1YFilterCoefficientn4;
+};
+
+/* Packs *values into dst as 8 little-endian dwords, four 8-bit coefficients
+ * per dword (dwords 4..7 use only two byte lanes each).
+ */
+static inline void
+GEN8_SAMPLER_STATE_8X8_AVS_COEFFICIENTS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN8_SAMPLER_STATE_8X8_AVS_COEFFICIENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->Table0YFilterCoefficientn1, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn1, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn0, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn0, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Table0YFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn3, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn2, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn2, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Table0YFilterCoefficientn5, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn5, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn4, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn4, 0, 7) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Table0YFilterCoefficientn7, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn7, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn6, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn6, 0, 7) |
+      0;
+
+   dw[4] =
+      __gen_field(values->Table1XFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table1XFilterCoefficientn2, 16, 23) |
+      0;
+
+   dw[5] =
+      __gen_field(values->Table1XFilterCoefficientn5, 8, 15) |
+      __gen_field(values->Table1XFilterCoefficientn4, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->Table1YFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table1YFilterCoefficientn2, 16, 23) |
+      0;
+
+   dw[7] =
+      __gen_field(values->Table1YFilterCoefficientn5, 8, 15) |
+      __gen_field(values->Table1YFilterCoefficientn4, 0, 7) |
+      0;
+
+}
+
+/* Free-standing enum value tables.  They are plain object-like macros rather
+ * than C enums, presumably so they can be used directly in the bit-packing
+ * expressions above; the leading underscore on names such as _3DPRIM_* keeps
+ * identifiers that would otherwise start with a digit valid C.
+ */
+/* Enum 3D_Prim_Topo_Type */
+#define     _3DPRIM_POINTLIST                                  1
+#define     _3DPRIM_LINELIST                                   2
+#define     _3DPRIM_LINESTRIP                                  3
+#define     _3DPRIM_TRILIST                                    4
+#define     _3DPRIM_TRISTRIP                                   5
+#define     _3DPRIM_TRIFAN                                     6
+#define     _3DPRIM_QUADLIST                                   7
+#define     _3DPRIM_QUADSTRIP                                  8
+#define     _3DPRIM_LINELIST_ADJ                               9
+#define     _3DPRIM_LINESTRIP_ADJ                             10
+#define     _3DPRIM_TRILIST_ADJ                               11
+#define     _3DPRIM_TRISTRIP_ADJ                              12
+#define     _3DPRIM_TRISTRIP_REVERSE                          13
+#define     _3DPRIM_POLYGON                                   14
+#define     _3DPRIM_RECTLIST                                  15
+#define     _3DPRIM_LINELOOP                                  16
+#define     _3DPRIM_POINTLIST_BF                              17
+#define     _3DPRIM_LINESTRIP_CONT                            18
+#define     _3DPRIM_LINESTRIP_BF                              19
+#define     _3DPRIM_LINESTRIP_CONT_BF                         20
+#define     _3DPRIM_TRIFAN_NOSTIPPLE                          22
+#define     _3DPRIM_PATCHLIST_1                               32
+#define     _3DPRIM_PATCHLIST_2                               33
+#define     _3DPRIM_PATCHLIST_3                               34
+#define     _3DPRIM_PATCHLIST_4                               35
+#define     _3DPRIM_PATCHLIST_5                               36
+#define     _3DPRIM_PATCHLIST_6                               37
+#define     _3DPRIM_PATCHLIST_7                               38
+#define     _3DPRIM_PATCHLIST_8                               39
+#define     _3DPRIM_PATCHLIST_9                               40
+#define     _3DPRIM_PATCHLIST_10                              41
+#define     _3DPRIM_PATCHLIST_11                              42
+#define     _3DPRIM_PATCHLIST_12                              43
+#define     _3DPRIM_PATCHLIST_13                              44
+#define     _3DPRIM_PATCHLIST_14                              45
+#define     _3DPRIM_PATCHLIST_15                              46
+#define     _3DPRIM_PATCHLIST_16                              47
+#define     _3DPRIM_PATCHLIST_17                              48
+#define     _3DPRIM_PATCHLIST_18                              49
+#define     _3DPRIM_PATCHLIST_19                              50
+#define     _3DPRIM_PATCHLIST_20                              51
+#define     _3DPRIM_PATCHLIST_21                              52
+#define     _3DPRIM_PATCHLIST_22                              53
+#define     _3DPRIM_PATCHLIST_23                              54
+#define     _3DPRIM_PATCHLIST_24                              55
+#define     _3DPRIM_PATCHLIST_25                              56
+#define     _3DPRIM_PATCHLIST_26                              57
+#define     _3DPRIM_PATCHLIST_27                              58
+#define     _3DPRIM_PATCHLIST_28                              59
+#define     _3DPRIM_PATCHLIST_29                              60
+#define     _3DPRIM_PATCHLIST_30                              61
+#define     _3DPRIM_PATCHLIST_31                              62
+#define     _3DPRIM_PATCHLIST_32                              63
+
+/* Enum 3D_Vertex_Component_Control */
+#define     VFCOMP_NOSTORE                                     0
+#define     VFCOMP_STORE_SRC                                   1
+#define     VFCOMP_STORE_0                                     2
+#define     VFCOMP_STORE_1_FP                                  3
+#define     VFCOMP_STORE_1_INT                                 4
+#define     VFCOMP_STORE_PID                                   7
+
+/* Enum WRAP_SHORTEST_ENABLE (bit mask: X=1, Y=2, Z=4, W=8) */
+#define     WSE_X                                              1
+#define     WSE_Y                                              2
+#define     WSE_XY                                             3
+#define     WSE_Z                                              4
+#define     WSE_XZ                                             5
+#define     WSE_YZ                                             6
+#define     WSE_XYZ                                            7
+#define     WSE_W                                              8
+#define     WSE_XW                                             9
+#define     WSE_YW                                            10
+#define     WSE_XYW                                           11
+#define     WSE_ZW                                            12
+#define     WSE_XZW                                           13
+#define     WSE_YZW                                           14
+#define     WSE_XYZW                                          15
+
+/* Enum 3D_Stencil_Operation */
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+
+/* Enum 3D_Color_Buffer_Blend_Factor */
+#define     BLENDFACTOR_ONE                                    1
+#define     BLENDFACTOR_SRC_COLOR                              2
+#define     BLENDFACTOR_SRC_ALPHA                              3
+#define     BLENDFACTOR_DST_ALPHA                              4
+#define     BLENDFACTOR_DST_COLOR                              5
+#define     BLENDFACTOR_SRC_ALPHA_SATURATE                     6
+#define     BLENDFACTOR_CONST_COLOR                            7
+#define     BLENDFACTOR_CONST_ALPHA                            8
+#define     BLENDFACTOR_SRC1_COLOR                             9
+#define     BLENDFACTOR_SRC1_ALPHA                            10
+#define     BLENDFACTOR_ZERO                                  17
+#define     BLENDFACTOR_INV_SRC_COLOR                         18
+#define     BLENDFACTOR_INV_SRC_ALPHA                         19
+#define     BLENDFACTOR_INV_DST_ALPHA                         20
+#define     BLENDFACTOR_INV_DST_COLOR                         21
+#define     BLENDFACTOR_INV_CONST_COLOR                       23
+#define     BLENDFACTOR_INV_CONST_ALPHA                       24
+#define     BLENDFACTOR_INV_SRC1_COLOR                        25
+#define     BLENDFACTOR_INV_SRC1_ALPHA                        26
+
+/* Enum 3D_Color_Buffer_Blend_Function */
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+
+/* Enum 3D_Compare_Function */
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+
+/* Enum 3D_Logic_Op_Function */
+#define     LOGICOP_CLEAR                                      0
+#define     LOGICOP_NOR                                        1
+#define     LOGICOP_AND_INVERTED                               2
+#define     LOGICOP_COPY_INVERTED                              3
+#define     LOGICOP_AND_REVERSE                                4
+#define     LOGICOP_INVERT                                     5
+#define     LOGICOP_XOR                                        6
+#define     LOGICOP_NAND                                       7
+#define     LOGICOP_AND                                        8
+#define     LOGICOP_EQUIV                                      9
+#define     LOGICOP_NOOP                                      10
+#define     LOGICOP_OR_INVERTED                               11
+#define     LOGICOP_COPY                                      12
+#define     LOGICOP_OR_REVERSE                                13
+#define     LOGICOP_OR                                        14
+#define     LOGICOP_SET                                       15
+
+/* Enum SURFACE_FORMAT */
+#define     R32G32B32A32_FLOAT                                 0
+#define     R32G32B32A32_SINT                                  1
+#define     R32G32B32A32_UINT                                  2
+#define     R32G32B32A32_UNORM                                 3
+#define     R32G32B32A32_SNORM                                 4
+#define     R64G64_FLOAT                                       5
+#define     R32G32B32X32_FLOAT                                 6
+#define     R32G32B32A32_SSCALED                               7
+#define     R32G32B32A32_USCALED                               8
+#define     R32G32B32A32_SFIXED                               32
+#define     R64G64_PASSTHRU                                   33
+#define     R32G32B32_FLOAT                                   64
+#define     R32G32B32_SINT                                    65
+#define     R32G32B32_UINT                                    66
+#define     R32G32B32_UNORM                                   67
+#define     R32G32B32_SNORM                                   68
+#define     R32G32B32_SSCALED                                 69
+#define     R32G32B32_USCALED                                 70
+#define     R32G32B32_SFIXED                                  80
+#define     R16G16B16A16_UNORM                               128
+#define     R16G16B16A16_SNORM                               129
+#define     R16G16B16A16_SINT                                130
+#define     R16G16B16A16_UINT                                131
+#define     R16G16B16A16_FLOAT                               132
+#define     R32G32_FLOAT                                     133
+#define     R32G32_SINT                                      134
+#define     R32G32_UINT                                      135
+#define     R32_FLOAT_X8X24_TYPELESS                         136
+#define     X32_TYPELESS_G8X24_UINT                          137
+#define     L32A32_FLOAT                                     138
+#define     R32G32_UNORM                                     139
+#define     R32G32_SNORM                                     140
+#define     R64_FLOAT                                        141
+#define     R16G16B16X16_UNORM                               142
+#define     R16G16B16X16_FLOAT                               143
+#define     A32X32_FLOAT                                     144
+#define     L32X32_FLOAT                                     145
+#define     I32X32_FLOAT                                     146
+#define     R16G16B16A16_SSCALED                             147
+#define     R16G16B16A16_USCALED                             148
+#define     R32G32_SSCALED                                   149
+#define     R32G32_USCALED                                   150
+#define     R32G32_SFIXED                                    160
+#define     R64_PASSTHRU                                     161
+#define     B8G8R8A8_UNORM                                   192
+#define     B8G8R8A8_UNORM_SRGB                              193
+#define     R10G10B10A2_UNORM                                194
+#define     R10G10B10A2_UNORM_SRGB                           195
+#define     R10G10B10A2_UINT                                 196
+#define     R10G10B10_SNORM_A2_UNORM                         197
+#define     R8G8B8A8_UNORM                                   199
+#define     R8G8B8A8_UNORM_SRGB                              200
+#define     R8G8B8A8_SNORM                                   201
+#define     R8G8B8A8_SINT                                    202
+#define     R8G8B8A8_UINT                                    203
+#define     R16G16_UNORM                                     204
+#define     R16G16_SNORM                                     205
+#define     R16G16_SINT                                      206
+#define     R16G16_UINT                                      207
+#define     R16G16_FLOAT                                     208
+#define     B10G10R10A2_UNORM                                209
+#define     B10G10R10A2_UNORM_SRGB                           210
+#define     R11G11B10_FLOAT                                  211
+#define     R32_SINT                                         214
+#define     R32_UINT                                         215
+#define     R32_FLOAT                                        216
+#define     R24_UNORM_X8_TYPELESS                            217
+#define     X24_TYPELESS_G8_UINT                             218
+#define     L32_UNORM                                        221
+#define     A32_UNORM                                        222
+#define     L16A16_UNORM                                     223
+#define     I24X8_UNORM                                      224
+#define     L24X8_UNORM                                      225
+#define     A24X8_UNORM                                      226
+#define     I32_FLOAT                                        227
+#define     L32_FLOAT                                        228
+#define     A32_FLOAT                                        229
+#define     X8B8_UNORM_G8R8_SNORM                            230
+#define     A8X8_UNORM_G8R8_SNORM                            231
+#define     B8X8_UNORM_G8R8_SNORM                            232
+#define     B8G8R8X8_UNORM                                   233
+#define     B8G8R8X8_UNORM_SRGB                              234
+#define     R8G8B8X8_UNORM                                   235
+#define     R8G8B8X8_UNORM_SRGB                              236
+#define     R9G9B9E5_SHAREDEXP                               237
+#define     B10G10R10X2_UNORM                                238
+#define     L16A16_FLOAT                                     240
+#define     R32_UNORM                                        241
+#define     R32_SNORM                                        242
+#define     R10G10B10X2_USCALED                              243
+#define     R8G8B8A8_SSCALED                                 244
+#define     R8G8B8A8_USCALED                                 245
+#define     R16G16_SSCALED                                   246
+#define     R16G16_USCALED                                   247
+#define     R32_SSCALED                                      248
+#define     R32_USCALED                                      249
+#define     B5G6R5_UNORM                                     256
+#define     B5G6R5_UNORM_SRGB                                257
+#define     B5G5R5A1_UNORM                                   258
+#define     B5G5R5A1_UNORM_SRGB                              259
+#define     B4G4R4A4_UNORM                                   260
+#define     B4G4R4A4_UNORM_SRGB                              261
+#define     R8G8_UNORM                                       262
+#define     R8G8_SNORM                                       263
+#define     R8G8_SINT                                        264
+#define     R8G8_UINT                                        265
+#define     R16_UNORM                                        266
+#define     R16_SNORM                                        267
+#define     R16_SINT                                         268
+#define     R16_UINT                                         269
+#define     R16_FLOAT                                        270
+#define     A8P8_UNORM_PALETTE0                              271
+#define     A8P8_UNORM_PALETTE1                              272
+#define     I16_UNORM                                        273
+#define     L16_UNORM                                        274
+#define     A16_UNORM                                        275
+#define     L8A8_UNORM                                       276
+#define     I16_FLOAT                                        277
+#define     L16_FLOAT                                        278
+#define     A16_FLOAT                                        279
+#define     L8A8_UNORM_SRGB                                  280
+#define     R5G5_SNORM_B6_UNORM                              281
+#define     B5G5R5X1_UNORM                                   282
+#define     B5G5R5X1_UNORM_SRGB                              283
+#define     R8G8_SSCALED                                     284
+#define     R8G8_USCALED                                     285
+#define     R16_SSCALED                                      286
+#define     R16_USCALED                                      287
+#define     P8A8_UNORM_PALETTE0                              290
+#define     P8A8_UNORM_PALETTE1                              291
+#define     A1B5G5R5_UNORM                                   292
+#define     A4B4G4R4_UNORM                                   293
+#define     L8A8_UINT                                        294
+#define     L8A8_SINT                                        295
+#define     R8_UNORM                                         320
+#define     R8_SNORM                                         321
+#define     R8_SINT                                          322
+#define     R8_UINT                                          323
+#define     A8_UNORM                                         324
+#define     I8_UNORM                                         325
+#define     L8_UNORM                                         326
+#define     P4A4_UNORM_PALETTE0                              327
+#define     A4P4_UNORM_PALETTE0                              328
+#define     R8_SSCALED                                       329
+#define     R8_USCALED                                       330
+#define     P8_UNORM_PALETTE0                                331
+#define     L8_UNORM_SRGB                                    332
+#define     P8_UNORM_PALETTE1                                333
+#define     P4A4_UNORM_PALETTE1                              334
+#define     A4P4_UNORM_PALETTE1                              335
+#define     Y8_UNORM                                         336
+#define     L8_UINT                                          338
+#define     L8_SINT                                          339
+#define     I8_UINT                                          340
+#define     I8_SINT                                          341
+#define     DXT1_RGB_SRGB                                    384
+#define     R1_UNORM                                         385
+#define     YCRCB_NORMAL                                     386
+#define     YCRCB_SWAPUVY                                    387
+#define     P2_UNORM_PALETTE0                                388
+#define     P2_UNORM_PALETTE1                                389
+#define     BC1_UNORM                                        390
+#define     BC2_UNORM                                        391
+#define     BC3_UNORM                                        392
+#define     BC4_UNORM                                        393
+#define     BC5_UNORM                                        394
+#define     BC1_UNORM_SRGB                                   395
+#define     BC2_UNORM_SRGB                                   396
+#define     BC3_UNORM_SRGB                                   397
+#define     MONO8                                            398
+#define     YCRCB_SWAPUV                                     399
+#define     YCRCB_SWAPY                                      400
+#define     DXT1_RGB                                         401
+#define     FXT1                                             402
+#define     R8G8B8_UNORM                                     403
+#define     R8G8B8_SNORM                                     404
+#define     R8G8B8_SSCALED                                   405
+#define     R8G8B8_USCALED                                   406
+#define     R64G64B64A64_FLOAT                               407
+#define     R64G64B64_FLOAT                                  408
+#define     BC4_SNORM                                        409
+#define     BC5_SNORM                                        410
+#define     R16G16B16_FLOAT                                  411
+#define     R16G16B16_UNORM                                  412
+#define     R16G16B16_SNORM                                  413
+#define     R16G16B16_SSCALED                                414
+#define     R16G16B16_USCALED                                415
+#define     BC6H_SF16                                        417
+#define     BC7_UNORM                                        418
+#define     BC7_UNORM_SRGB                                   419
+#define     BC6H_UF16                                        420
+#define     PLANAR_420_8                                     421
+#define     R8G8B8_UNORM_SRGB                                424
+#define     ETC1_RGB8                                        425
+#define     ETC2_RGB8                                        426
+#define     EAC_R11                                          427
+#define     EAC_RG11                                         428
+#define     EAC_SIGNED_R11                                   429
+#define     EAC_SIGNED_RG11                                  430
+#define     ETC2_SRGB8                                       431
+#define     R16G16B16_UINT                                   432
+#define     R16G16B16_SINT                                   433
+#define     R32_SFIXED                                       434
+#define     R10G10B10A2_SNORM                                435
+#define     R10G10B10A2_USCALED                              436
+#define     R10G10B10A2_SSCALED                              437
+#define     R10G10B10A2_SINT                                 438
+#define     B10G10R10A2_SNORM                                439
+#define     B10G10R10A2_USCALED                              440
+#define     B10G10R10A2_SSCALED                              441
+#define     B10G10R10A2_UINT                                 442
+#define     B10G10R10A2_SINT                                 443
+#define     R64G64B64A64_PASSTHRU                            444
+#define     R64G64B64_PASSTHRU                               445
+#define     ETC2_RGB8_PTA                                    448
+#define     ETC2_SRGB8_PTA                                   449
+#define     ETC2_EAC_RGBA8                                   450
+#define     ETC2_EAC_SRGB8_A8                                451
+#define     R8G8B8_UINT                                      456
+#define     R8G8B8_SINT                                      457
+#define     RAW                                              511
+
+/* Enum Shader Channel Select -- swizzle source for each output channel of a
+ * SURFACE_STATE (hardware encoding; values 2 and 3 are intentionally absent
+ * from the encoding).
+ */
+#define     SCS_ZERO                                           0
+#define     SCS_ONE                                            1
+#define     SCS_RED                                            4
+#define     SCS_GREEN                                          5
+#define     SCS_BLUE                                           6
+#define     SCS_ALPHA                                          7
+
+/* Enum Clear Color -- per-channel fast-clear color value selector */
+#define     CC_ZERO                                            0
+#define     CC_ONE                                             1
+
+/* Enum Texture Coordinate Mode -- sampler address-control modes
+ * (wrap/mirror/clamp behavior outside [0,1] texture coordinates).
+ */
+#define     TCM_WRAP                                           0
+#define     TCM_MIRROR                                         1
+#define     TCM_CLAMP                                          2
+#define     TCM_CUBE                                           3
+#define     TCM_CLAMP_BORDER                                   4
+#define     TCM_MIRROR_ONCE                                    5
+#define     TCM_HALF_BORDER                                    6
+
diff --git a/src/vulkan/gen8_pipeline.c b/src/vulkan/gen8_pipeline.c
new file mode 100644 (file)
index 0000000..2be71a0
--- /dev/null
@@ -0,0 +1,706 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen8_pack.h"
+#include "gen9_pack.h"
+
+#include "genX_pipeline_util.h"
+
+static void
+emit_vertex_input(struct anv_pipeline *pipeline,
+                  const VkPipelineVertexInputStateCreateInfo *info,
+                  const struct anv_graphics_pipeline_create_info *extra)
+{
+   static_assert(ANV_GEN >= 8, "should be compiling this for gen < 8");
+
+   uint32_t elements;
+   if (extra && extra->disable_vs) {
+      /* If the VS is disabled, just assume the user knows what they're
+       * doing and apply the layout blindly.  This can only come from
+       * meta, so this *should* be safe.
+       */
+      elements = 0;
+      for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++)
+         elements |= (1 << info->pVertexAttributeDescriptions[i].location);
+   } else {
+      /* Pull inputs_read out of the VS prog data */
+      uint64_t inputs_read = pipeline->vs_prog_data.inputs_read;
+      assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
+      elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+   }
+
+   const uint32_t num_dwords = 1 + __builtin_popcount(elements) * 2;
+
+   uint32_t *p;
+   if (elements != 0) {
+      p = anv_batch_emitn(&pipeline->batch, num_dwords,
+                          GENX(3DSTATE_VERTEX_ELEMENTS));
+      memset(p + 1, 0, (num_dwords - 1) * 4);
+   }
+
+   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
+      const VkVertexInputAttributeDescription *desc =
+         &info->pVertexAttributeDescriptions[i];
+      enum isl_format format = anv_get_isl_format(desc->format,
+                                                  VK_IMAGE_ASPECT_COLOR_BIT,
+                                                  VK_IMAGE_TILING_LINEAR);
+
+      assert(desc->binding < 32);
+
+      if ((elements & (1 << desc->location)) == 0)
+         continue; /* Binding unused */
+
+      uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1));
+
+      struct GENX(VERTEX_ELEMENT_STATE) element = {
+         .VertexBufferIndex = desc->binding,
+         .Valid = true,
+         .SourceElementFormat = format,
+         .EdgeFlagEnable = false,
+         .SourceElementOffset = desc->offset,
+         .Component0Control = vertex_element_comp_control(format, 0),
+         .Component1Control = vertex_element_comp_control(format, 1),
+         .Component2Control = vertex_element_comp_control(format, 2),
+         .Component3Control = vertex_element_comp_control(format, 3),
+      };
+      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);
+
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING),
+                     .InstancingEnable = pipeline->instancing_enable[desc->binding],
+                     .VertexElementIndex = slot,
+                     /* Vulkan so far doesn't have an instance divisor, so
+                      * this is always 1 (ignored if not instancing). */
+                     .InstanceDataStepRate = 1);
+   }
+
+   const uint32_t id_slot = __builtin_popcount(elements);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_SGVS),
+                  .VertexIDEnable = pipeline->vs_prog_data.uses_vertexid,
+                  .VertexIDComponentNumber = 2,
+                  .VertexIDElementOffset = id_slot,
+                  .InstanceIDEnable = pipeline->vs_prog_data.uses_instanceid,
+                  .InstanceIDComponentNumber = 3,
+                  .InstanceIDElementOffset = id_slot);
+}
+
+/* Emit 3DSTATE_VF_TOPOLOGY for the input-assembly stage.  "info" and
+ * "extra" are unused here: the primitive topology was presumably already
+ * translated into pipeline->topology during pipeline init (confirm in
+ * anv_pipeline_init); the parameters keep the signature symmetric with
+ * the other emit_*_state helpers.
+ */
+static void
+emit_ia_state(struct anv_pipeline *pipeline,
+              const VkPipelineInputAssemblyStateCreateInfo *info,
+              const struct anv_graphics_pipeline_create_info *extra)
+{
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_TOPOLOGY),
+                  .PrimitiveTopologyType = pipeline->topology);
+}
+
+/* Pack the rasterization state (3DSTATE_SF and 3DSTATE_RASTER) into the
+ * pipeline's gen8 state arrays.  These are packed rather than emitted
+ * directly because they get merged with dynamic state at draw time.
+ */
+static void
+emit_rs_state(struct anv_pipeline *pipeline,
+              const VkPipelineRasterizationStateCreateInfo *info,
+              const struct anv_graphics_pipeline_create_info *extra)
+{
+   struct GENX(3DSTATE_SF) sf = {
+      GENX(3DSTATE_SF_header),
+      .ViewportTransformEnable = !(extra && extra->disable_viewport),
+      .TriangleStripListProvokingVertexSelect = 0,
+      .LineStripListProvokingVertexSelect = 0,
+      .TriangleFanProvokingVertexSelect = 0,
+      /* Use the shader-written point size when the VS writes one,
+       * otherwise the fixed 1.0 below. */
+      .PointWidthSource = pipeline->writes_point_size ? Vertex : State,
+      .PointWidth = 1.0,
+   };
+
+   /* FINISHME: VkBool32 rasterizerDiscardEnable; */
+
+   GENX(3DSTATE_SF_pack)(NULL, pipeline->gen8.sf, &sf);
+
+   struct GENX(3DSTATE_RASTER) raster = {
+      GENX(3DSTATE_RASTER_header),
+      .FrontWinding = vk_to_gen_front_face[info->frontFace],
+      .CullMode = vk_to_gen_cullmode[info->cullMode],
+      .FrontFaceFillMode = vk_to_gen_fillmode[info->polygonMode],
+      .BackFaceFillMode = vk_to_gen_fillmode[info->polygonMode],
+      .ScissorRectangleEnable = !(extra && extra->disable_scissor),
+#if ANV_GEN == 8
+      .ViewportZClipTestEnable = true,
+#else
+      /* GEN9+ splits ViewportZClipTestEnable into near and far enable bits */
+      .ViewportZFarClipTestEnable = true,
+      .ViewportZNearClipTestEnable = true,
+#endif
+   };
+
+   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gen8.raster, &raster);
+}
+
+/* Build the color-blend state: allocate a BLEND_STATE block from the
+ * dynamic state pool, fill one BLEND_STATE_ENTRY per color attachment,
+ * then emit 3DSTATE_BLEND_STATE_POINTERS referencing it.  On non-LLC
+ * platforms the CPU-written state is explicitly clflushed so the GPU
+ * sees it.
+ */
+static void
+emit_cb_state(struct anv_pipeline *pipeline,
+              const VkPipelineColorBlendStateCreateInfo *info,
+              const VkPipelineMultisampleStateCreateInfo *ms_info)
+{
+   struct anv_device *device = pipeline->device;
+
+   uint32_t num_dwords = GENX(BLEND_STATE_length);
+   pipeline->blend_state =
+      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
+
+   struct GENX(BLEND_STATE) blend_state = {
+      /* ms_info is optional; treat a missing struct as "disabled". */
+      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
+      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
+   };
+
+   for (uint32_t i = 0; i < info->attachmentCount; i++) {
+      const VkPipelineColorBlendAttachmentState *a = &info->pAttachments[i];
+
+      /* Hardware needs the independent-alpha bit set whenever any
+       * attachment blends alpha differently from color. */
+      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
+          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
+          a->colorBlendOp != a->alphaBlendOp) {
+         blend_state.IndependentAlphaBlendEnable = true;
+      }
+
+      /* NOTE(review): vk_to_gen_logic_op[info->logicOp] is indexed even
+       * when logicOpEnable is false -- relies on the app passing a valid
+       * enum in that case; verify against the Vulkan valid-usage rules. */
+      blend_state.Entry[i] = (struct GENX(BLEND_STATE_ENTRY)) {
+         .LogicOpEnable = info->logicOpEnable,
+         .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
+         .ColorBufferBlendEnable = a->blendEnable,
+         .PreBlendSourceOnlyClampEnable = false,
+         .ColorClampRange = COLORCLAMP_RTFORMAT,
+         .PreBlendColorClampEnable = true,
+         .PostBlendColorClampEnable = true,
+         .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
+         .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
+         .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
+         .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
+         .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
+         .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
+         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
+         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
+         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
+         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
+      };
+
+      /* Our hardware applies the blend factor prior to the blend function
+       * regardless of what function is used.  Technically, this means the
+       * hardware can do MORE than GL or Vulkan specify.  However, it also
+       * means that, for MIN and MAX, we have to stomp the blend factor to
+       * ONE to make it a no-op.
+       */
+      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
+          a->colorBlendOp == VK_BLEND_OP_MAX) {
+         blend_state.Entry[i].SourceBlendFactor = BLENDFACTOR_ONE;
+         blend_state.Entry[i].DestinationBlendFactor = BLENDFACTOR_ONE;
+      }
+      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
+          a->alphaBlendOp == VK_BLEND_OP_MAX) {
+         blend_state.Entry[i].SourceAlphaBlendFactor = BLENDFACTOR_ONE;
+         blend_state.Entry[i].DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+      }
+   }
+
+   GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
+   /* Without LLC the GPU doesn't snoop CPU caches; flush explicitly. */
+   if (!device->info.has_llc)
+      anv_state_clflush(pipeline->blend_state);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS),
+                  .BlendStatePointer = pipeline->blend_state.offset,
+                  .BlendStatePointerValid = true);
+}
+
+/* Pack 3DSTATE_WM_DEPTH_STENCIL into the pipeline's per-gen state array
+ * (gen8 vs gen9 layouts differ, so "dw" selects the right one at
+ * compile time via ANV_GEN).  The packed dwords are OR'd with dynamic
+ * stencil state at draw time, which is why a NULL info zeroes both
+ * arrays instead of leaving them uninitialized.
+ */
+static void
+emit_ds_state(struct anv_pipeline *pipeline,
+              const VkPipelineDepthStencilStateCreateInfo *info)
+{
+   uint32_t *dw = ANV_GEN == 8 ?
+      pipeline->gen8.wm_depth_stencil : pipeline->gen9.wm_depth_stencil;
+
+   if (info == NULL) {
+      /* We're going to OR this together with the dynamic state.  We need
+       * to make sure it's initialized to something useful.
+       */
+      memset(pipeline->gen8.wm_depth_stencil, 0,
+             sizeof(pipeline->gen8.wm_depth_stencil));
+      memset(pipeline->gen9.wm_depth_stencil, 0,
+             sizeof(pipeline->gen9.wm_depth_stencil));
+      return;
+   }
+
+   /* VkBool32 depthBoundsTestEnable; // optional (depth_bounds_test) */
+
+   struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
+      .DepthTestEnable = info->depthTestEnable,
+      .DepthBufferWriteEnable = info->depthWriteEnable,
+      .DepthTestFunction = vk_to_gen_compare_op[info->depthCompareOp],
+      /* Front and back faces always use their own stencil state. */
+      .DoubleSidedStencilEnable = true,
+
+      .StencilTestEnable = info->stencilTestEnable,
+      .StencilFailOp = vk_to_gen_stencil_op[info->front.failOp],
+      .StencilPassDepthPassOp = vk_to_gen_stencil_op[info->front.passOp],
+      .StencilPassDepthFailOp = vk_to_gen_stencil_op[info->front.depthFailOp],
+      .StencilTestFunction = vk_to_gen_compare_op[info->front.compareOp],
+      .BackfaceStencilFailOp = vk_to_gen_stencil_op[info->back.failOp],
+      .BackfaceStencilPassDepthPassOp = vk_to_gen_stencil_op[info->back.passOp],
+      .BackfaceStencilPassDepthFailOp =vk_to_gen_stencil_op[info->back.depthFailOp],
+      .BackfaceStencilTestFunction = vk_to_gen_compare_op[info->back.compareOp],
+   };
+
+   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &wm_depth_stencil);
+}
+
+VkResult
+genX(graphics_pipeline_create)(
+    VkDevice                                    _device,
+    struct anv_pipeline_cache *                 cache,
+    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
+    const struct anv_graphics_pipeline_create_info *extra,
+    const VkAllocationCallbacks*                pAllocator,
+    VkPipeline*                                 pPipeline)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_pipeline *pipeline;
+   VkResult result;
+   uint32_t offset, length;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
+
+   pipeline = anv_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (pipeline == NULL)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_pipeline_init(pipeline, device, cache,
+                              pCreateInfo, extra, pAllocator);
+   if (result != VK_SUCCESS) {
+      anv_free2(&device->alloc, pAllocator, pipeline);
+      return result;
+   }
+
+   assert(pCreateInfo->pVertexInputState);
+   emit_vertex_input(pipeline, pCreateInfo->pVertexInputState, extra);
+   assert(pCreateInfo->pInputAssemblyState);
+   emit_ia_state(pipeline, pCreateInfo->pInputAssemblyState, extra);
+   assert(pCreateInfo->pRasterizationState);
+   emit_rs_state(pipeline, pCreateInfo->pRasterizationState, extra);
+   emit_ds_state(pipeline, pCreateInfo->pDepthStencilState);
+   emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
+                           pCreateInfo->pMultisampleState);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_STATISTICS),
+                   .StatisticsEnable = true);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), .Enable = false);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_TE), .TEEnable = false);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), .FunctionEnable = false);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), .SOFunctionEnable = false);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS),
+                  .ConstantBufferOffset = 0,
+                  .ConstantBufferSize = 4);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS),
+                  .ConstantBufferOffset = 4,
+                  .ConstantBufferSize = 4);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS),
+                  .ConstantBufferOffset = 8,
+                  .ConstantBufferSize = 4);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_WM_CHROMAKEY),
+                  .ChromaKeyKillEnable = false);
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_AA_LINE_PARAMETERS));
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP),
+                  .ClipEnable = true,
+                  .ViewportXYClipTestEnable = !(extra && extra->disable_viewport),
+                  .MinimumPointWidth = 0.125,
+                  .MaximumPointWidth = 255.875);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_WM),
+                  .StatisticsEnable = true,
+                  .LineEndCapAntialiasingRegionWidth = _05pixels,
+                  .LineAntialiasingRegionWidth = _10pixels,
+                  .EarlyDepthStencilControl = NORMAL,
+                  .ForceThreadDispatchEnable = NORMAL,
+                  .PointRasterizationRule = RASTRULE_UPPER_RIGHT,
+                  .BarycentricInterpolationMode =
+                     pipeline->wm_prog_data.barycentric_interp_modes);
+
+   uint32_t samples = 1;
+   uint32_t log2_samples = __builtin_ffs(samples) - 1;
+   bool enable_sampling = samples > 1 ? true : false;
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_MULTISAMPLE),
+                  .PixelPositionOffsetEnable = enable_sampling,
+                  .PixelLocation = CENTER,
+                  .NumberofMultisamples = log2_samples);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SAMPLE_MASK),
+                  .SampleMask = 0xffff);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_URB_VS),
+                  .VSURBStartingAddress = pipeline->urb.vs_start,
+                  .VSURBEntryAllocationSize = pipeline->urb.vs_size - 1,
+                  .VSNumberofURBEntries = pipeline->urb.nr_vs_entries);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_URB_GS),
+                  .GSURBStartingAddress = pipeline->urb.gs_start,
+                  .GSURBEntryAllocationSize = pipeline->urb.gs_size - 1,
+                  .GSNumberofURBEntries = pipeline->urb.nr_gs_entries);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_URB_HS),
+                  .HSURBStartingAddress = pipeline->urb.vs_start,
+                  .HSURBEntryAllocationSize = 0,
+                  .HSNumberofURBEntries = 0);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_URB_DS),
+                  .DSURBStartingAddress = pipeline->urb.vs_start,
+                  .DSURBEntryAllocationSize = 0,
+                  .DSNumberofURBEntries = 0);
+
+   const struct brw_gs_prog_data *gs_prog_data = &pipeline->gs_prog_data;
+   offset = 1;
+   length = (gs_prog_data->base.vue_map.num_slots + 1) / 2 - offset;
+
+   if (pipeline->gs_kernel == NO_KERNEL)
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), .Enable = false);
+   else
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS),
+                     .SingleProgramFlow = false,
+                     .KernelStartPointer = pipeline->gs_kernel,
+                     .VectorMaskEnable = Dmask,
+                     .SamplerCount = 0,
+                     .BindingTableEntryCount = 0,
+                     .ExpectedVertexCount = pipeline->gs_vertex_count,
+
+                     .ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_GEOMETRY],
+                     .PerThreadScratchSpace = ffs(gs_prog_data->base.base.total_scratch / 2048),
+
+                     .OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1,
+                     .OutputTopology = gs_prog_data->output_topology,
+                     .VertexURBEntryReadLength = gs_prog_data->base.urb_read_length,
+                     .IncludeVertexHandles = gs_prog_data->base.include_vue_handles,
+                     .DispatchGRFStartRegisterForURBData =
+                        gs_prog_data->base.base.dispatch_grf_start_reg,
+
+                     .MaximumNumberofThreads = device->info.max_gs_threads / 2 - 1,
+                     .ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords,
+                     .DispatchMode = gs_prog_data->base.dispatch_mode,
+                     .StatisticsEnable = true,
+                     .IncludePrimitiveID = gs_prog_data->include_primitive_id,
+                     .ReorderMode = TRAILING,
+                     .Enable = true,
+
+                     .ControlDataFormat = gs_prog_data->control_data_format,
+
+                     .StaticOutput = gs_prog_data->static_vertex_count >= 0,
+                     .StaticOutputVertexCount =
+                        gs_prog_data->static_vertex_count >= 0 ?
+                        gs_prog_data->static_vertex_count : 0,
+
+                     /* FIXME: mesa sets this based on ctx->Transform.ClipPlanesEnabled:
+                      * UserClipDistanceClipTestEnableBitmask_3DSTATE_GS(v)
+                      * UserClipDistanceCullTestEnableBitmask(v)
+                      */
+
+                     .VertexURBEntryOutputReadOffset = offset,
+                     .VertexURBEntryOutputLength = length);
+
+   const struct brw_vue_prog_data *vue_prog_data = &pipeline->vs_prog_data.base;
+   /* Skip the VUE header and position slots */
+   offset = 1;
+   length = (vue_prog_data->vue_map.num_slots + 1) / 2 - offset;
+
+   uint32_t vs_start = pipeline->vs_simd8 != NO_KERNEL ? pipeline->vs_simd8 :
+                                                         pipeline->vs_vec4;
+
+   if (vs_start == NO_KERNEL || (extra && extra->disable_vs))
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS),
+                     .FunctionEnable = false,
+                     /* Even if VS is disabled, SBE still gets the amount of
+                      * vertex data to read from this field. */
+                     .VertexURBEntryOutputReadOffset = offset,
+                     .VertexURBEntryOutputLength = length);
+   else
+      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS),
+                     .KernelStartPointer = vs_start,
+                     .SingleVertexDispatch = Multiple,
+                     .VectorMaskEnable = Dmask,
+                     .SamplerCount = 0,
+                     .BindingTableEntryCount =
+                     vue_prog_data->base.binding_table.size_bytes / 4,
+                     .ThreadDispatchPriority = Normal,
+                     .FloatingPointMode = IEEE754,
+                     .IllegalOpcodeExceptionEnable = false,
+                     .AccessesUAV = false,
+                     .SoftwareExceptionEnable = false,
+
+                     .ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_VERTEX],
+                     .PerThreadScratchSpace = ffs(vue_prog_data->base.total_scratch / 2048),
+
+                     .DispatchGRFStartRegisterForURBData =
+                     vue_prog_data->base.dispatch_grf_start_reg,
+                     .VertexURBEntryReadLength = vue_prog_data->urb_read_length,
+                     .VertexURBEntryReadOffset = 0,
+
+                     .MaximumNumberofThreads = device->info.max_vs_threads - 1,
+                     .StatisticsEnable = false,
+                     .SIMD8DispatchEnable = pipeline->vs_simd8 != NO_KERNEL,
+                     .VertexCacheDisable = false,
+                     .FunctionEnable = true,
+
+                     .VertexURBEntryOutputReadOffset = offset,
+                     .VertexURBEntryOutputLength = length,
+                     .UserClipDistanceClipTestEnableBitmask = 0,
+                     .UserClipDistanceCullTestEnableBitmask = 0);
+
+   const struct brw_wm_prog_data *wm_prog_data = &pipeline->wm_prog_data;
+
+   /* TODO: We should clean this up.  Among other things, this is mostly
+    * shared with other gens.
+    */
+   const struct brw_vue_map *fs_input_map;
+   if (pipeline->gs_kernel == NO_KERNEL)
+      fs_input_map = &vue_prog_data->vue_map;
+   else
+      fs_input_map = &gs_prog_data->base.vue_map;
+
+   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
+      GENX(3DSTATE_SBE_SWIZ_header),
+   };
+
+   int max_source_attr = 0;
+   for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
+      int input_index = wm_prog_data->urb_setup[attr];
+
+      if (input_index < 0)
+        continue;
+
+      int source_attr = fs_input_map->varying_to_slot[attr];
+      max_source_attr = MAX2(max_source_attr, source_attr);
+
+      if (input_index >= 16)
+        continue;
+
+      if (source_attr == -1) {
+         /* This attribute does not exist in the VUE--that means that the
+          * vertex shader did not write to it.  It could be that it's a
+          * regular varying read by the fragment shader but not written by the
+          * vertex shader or it's gl_PrimitiveID. In the first case the value
+          * is undefined, in the second it needs to be gl_PrimitiveID.
+          */
+         swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+         swiz.Attribute[input_index].ComponentOverrideX = true;
+         swiz.Attribute[input_index].ComponentOverrideY = true;
+         swiz.Attribute[input_index].ComponentOverrideZ = true;
+         swiz.Attribute[input_index].ComponentOverrideW = true;
+      } else {
+         /* We have to subtract two slots to account for the URB entry output
+          * read offset in the VS and GS stages.
+          */
+         swiz.Attribute[input_index].SourceAttribute = source_attr - 2;
+      }
+   }
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SBE),
+                  .AttributeSwizzleEnable = true,
+                  .ForceVertexURBEntryReadLength = false,
+                  .ForceVertexURBEntryReadOffset = false,
+                  .VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2),
+                  .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
+                  .NumberofSFOutputAttributes =
+                     wm_prog_data->num_varying_inputs,
+
+#if ANV_GEN >= 9
+                  .Attribute0ActiveComponentFormat = ACF_XYZW,
+                  .Attribute1ActiveComponentFormat = ACF_XYZW,
+                  .Attribute2ActiveComponentFormat = ACF_XYZW,
+                  .Attribute3ActiveComponentFormat = ACF_XYZW,
+                  .Attribute4ActiveComponentFormat = ACF_XYZW,
+                  .Attribute5ActiveComponentFormat = ACF_XYZW,
+                  .Attribute6ActiveComponentFormat = ACF_XYZW,
+                  .Attribute7ActiveComponentFormat = ACF_XYZW,
+                  .Attribute8ActiveComponentFormat = ACF_XYZW,
+                  .Attribute9ActiveComponentFormat = ACF_XYZW,
+                  .Attribute10ActiveComponentFormat = ACF_XYZW,
+                  .Attribute11ActiveComponentFormat = ACF_XYZW,
+                  .Attribute12ActiveComponentFormat = ACF_XYZW,
+                  .Attribute13ActiveComponentFormat = ACF_XYZW,
+                  .Attribute14ActiveComponentFormat = ACF_XYZW,
+                  .Attribute15ActiveComponentFormat = ACF_XYZW,
+                  /* wow, much field, very attribute */
+                  .Attribute16ActiveComponentFormat = ACF_XYZW,
+                  .Attribute17ActiveComponentFormat = ACF_XYZW,
+                  .Attribute18ActiveComponentFormat = ACF_XYZW,
+                  .Attribute19ActiveComponentFormat = ACF_XYZW,
+                  .Attribute20ActiveComponentFormat = ACF_XYZW,
+                  .Attribute21ActiveComponentFormat = ACF_XYZW,
+                  .Attribute22ActiveComponentFormat = ACF_XYZW,
+                  .Attribute23ActiveComponentFormat = ACF_XYZW,
+                  .Attribute24ActiveComponentFormat = ACF_XYZW,
+                  .Attribute25ActiveComponentFormat = ACF_XYZW,
+                  .Attribute26ActiveComponentFormat = ACF_XYZW,
+                  .Attribute27ActiveComponentFormat = ACF_XYZW,
+                  .Attribute28ActiveComponentFormat = ACF_XYZW,
+                  .Attribute29ActiveComponentFormat = ACF_XYZW,
+                  .Attribute28ActiveComponentFormat = ACF_XYZW,
+                  .Attribute29ActiveComponentFormat = ACF_XYZW,
+                  .Attribute30ActiveComponentFormat = ACF_XYZW,
+#endif
+      );
+
+   uint32_t *dw = anv_batch_emit_dwords(&pipeline->batch,
+                                        GENX(3DSTATE_SBE_SWIZ_length));
+   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->batch, dw, &swiz);
+
+   const int num_thread_bias = ANV_GEN == 8 ? 2 : 1;
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS),
+                  .KernelStartPointer0 = pipeline->ps_ksp0,
+
+                  .SingleProgramFlow = false,
+                  .VectorMaskEnable = true,
+                  .SamplerCount = 1,
+
+                  .ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_FRAGMENT],
+                  .PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch / 2048),
+
+                  .MaximumNumberofThreadsPerPSD = 64 - num_thread_bias,
+                  .PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
+                     POSOFFSET_SAMPLE: POSOFFSET_NONE,
+                  .PushConstantEnable = wm_prog_data->base.nr_params > 0,
+                  ._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL,
+                  ._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL,
+                  ._32PixelDispatchEnable = false,
+
+                  .DispatchGRFStartRegisterForConstantSetupData0 = pipeline->ps_grf_start0,
+                  .DispatchGRFStartRegisterForConstantSetupData1 = 0,
+                  .DispatchGRFStartRegisterForConstantSetupData2 = pipeline->ps_grf_start2,
+
+                  .KernelStartPointer1 = 0,
+                  .KernelStartPointer2 = pipeline->ps_ksp2);
+
+   bool per_sample_ps = false;
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_EXTRA),
+                  .PixelShaderValid = true,
+                  .PixelShaderKillsPixel = wm_prog_data->uses_kill,
+                  .PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode,
+                  .AttributeEnable = wm_prog_data->num_varying_inputs > 0,
+                  .oMaskPresenttoRenderTarget = wm_prog_data->uses_omask,
+                  .PixelShaderIsPerSample = per_sample_ps,
+#if ANV_GEN >= 9
+                  .PixelShaderPullsBary = wm_prog_data->pulls_bary,
+                  .InputCoverageMaskState = ICMS_NONE
+#endif
+      );
+
+   *pPipeline = anv_pipeline_to_handle(pipeline);
+
+   return VK_SUCCESS;
+}
+
/**
 * Gen8/9 implementation of compute pipeline creation.
 *
 * Allocates an anv_pipeline, compiles the single compute stage and records
 * the MEDIA_VFE_STATE programming into the pipeline's embedded batch.  On
 * success, *pPipeline receives the new handle.
 */
VkResult genX(compute_pipeline_create)(
    VkDevice                                    _device,
    struct anv_pipeline_cache *                 cache,
    const VkComputePipelineCreateInfo*          pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipeline)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);

   pipeline = anv_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pipeline->device = device;
   pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);

   pipeline->blend_state.map = NULL;

   result = anv_reloc_list_init(&pipeline->batch_relocs,
                                pAllocator ? pAllocator : &device->alloc);
   if (result != VK_SUCCESS) {
      anv_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }
   /* The pipeline's state commands are recorded into a fixed-size batch
    * buffer embedded in the pipeline object itself.
    */
   pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
   pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
   pipeline->batch.relocs = &pipeline->batch_relocs;

   /* When we free the pipeline, we detect stages based on the NULL status
    * of various prog_data pointers.  Make them NULL by default.
    */
   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
   memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));

   pipeline->vs_simd8 = NO_KERNEL;
   pipeline->vs_vec4 = NO_KERNEL;
   pipeline->gs_kernel = NO_KERNEL;

   pipeline->active_stages = 0;
   pipeline->total_scratch = 0;

   assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
   ANV_FROM_HANDLE(anv_shader_module, module,  pCreateInfo->stage.module);
   /* NOTE(review): the result of anv_pipeline_compile_cs() is discarded, so
    * a shader compile failure is never reported to the caller -- confirm
    * whether it should be checked and the pipeline torn down on error.
    */
   anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
                           pCreateInfo->stage.pName,
                           pCreateInfo->stage.pSpecializationInfo);

   pipeline->use_repclear = false;

   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;

   anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE),
                  .ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE],
                  /* ffs(total_scratch / 2048) presumably maps the scratch
                   * size to the field's log2-style encoding (0 when no
                   * scratch) -- confirm against the PRM encoding.
                   */
                  .PerThreadScratchSpace = ffs(cs_prog_data->base.total_scratch / 2048),
                  .ScratchSpaceBasePointerHigh = 0,
                  .StackSize = 0,

                  .MaximumNumberofThreads = device->info.max_cs_threads - 1,
                  .NumberofURBEntries = 2,
                  .ResetGatewayTimer = true,
#if ANV_GEN == 8
                  .BypassGatewayControl = true,
#endif
                  .URBEntryAllocationSize = 2,
                  .CURBEAllocationSize = 0);

   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
   /* Threads needed to cover one workgroup at the compiled SIMD width. */
   uint32_t group_size = prog_data->local_size[0] *
      prog_data->local_size[1] * prog_data->local_size[2];
   pipeline->cs_thread_width_max = DIV_ROUND_UP(group_size, prog_data->simd_size);
   uint32_t remainder = group_size & (prog_data->simd_size - 1);

   /* Execution mask for the last (possibly partial) thread of a workgroup:
    * enable only the channels that correspond to real invocations.
    */
   if (remainder > 0)
      pipeline->cs_right_mask = ~0u >> (32 - remainder);
   else
      pipeline->cs_right_mask = ~0u >> (32 - prog_data->simd_size);


   *pPipeline = anv_pipeline_to_handle(pipeline);

   return VK_SUCCESS;
}
diff --git a/src/vulkan/gen8_state.c b/src/vulkan/gen8_state.c
new file mode 100644 (file)
index 0000000..d42d0b1
--- /dev/null
@@ -0,0 +1,428 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "gen8_pack.h"
+#include "gen9_pack.h"
+
+#include "genX_state_util.h"
+
+void
/**
 * Pack a RENDER_SURFACE_STATE describing a typed buffer (SURFTYPE_BUFFER)
 * into `state`.
 *
 * @param state   destination for the packed dwords
 * @param format  isl format of one buffer element
 * @param offset  byte offset of the view within the bound surface
 * @param range   accessible size of the view, in bytes
 * @param stride  byte stride between elements; assumed non-zero (division
 *                below) and small enough for SurfacePitch -- TODO confirm
 *                callers guarantee this
 */
void
genX(fill_buffer_surface_state)(void *state, enum isl_format format,
                                uint32_t offset, uint32_t range, uint32_t stride)
{
   uint32_t num_elements = range / stride;

   struct GENX(RENDER_SURFACE_STATE) surface_state = {
      .SurfaceType = SURFTYPE_BUFFER,
      .SurfaceArray = false,
      .SurfaceFormat = format,
      .SurfaceVerticalAlignment = VALIGN4,
      .SurfaceHorizontalAlignment = HALIGN4,
      .TileMode = LINEAR,
      .SamplerL2BypassModeDisable = true,
      .RenderCacheReadWriteMode = WriteOnlyCache,
      .MemoryObjectControlState = GENX(MOCS),
      /* Buffer surfaces encode (num_elements - 1) split across the
       * Width [6:0], Height [20:7] and Depth [26:21] fields.
       */
      .Height = ((num_elements - 1) >> 7) & 0x3fff,
      .Width = (num_elements - 1) & 0x7f,
      .Depth = ((num_elements - 1) >> 21) & 0x3f,
      .SurfacePitch = stride - 1,
      .NumberofMultisamples = MULTISAMPLECOUNT_1,
      /* Identity channel swizzle. */
      .ShaderChannelSelectRed = SCS_RED,
      .ShaderChannelSelectGreen = SCS_GREEN,
      .ShaderChannelSelectBlue = SCS_BLUE,
      .ShaderChannelSelectAlpha = SCS_ALPHA,
      /* FIXME: We assume that the image must be bound at this time. */
      .SurfaceBaseAddress = { NULL, offset },
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &surface_state);
}
+
/* Translate an image alignment of 4, 8 or 16 (surface elements on gen9+,
 * samples on gen8 -- see get_halign_valign()) into the corresponding
 * RENDER_SURFACE_STATE alignment enum.  Unlisted indices are
 * zero-initialized.
 */
static const uint8_t anv_halign[] = {
    [4] = HALIGN4,
    [8] = HALIGN8,
    [16] = HALIGN16,
};

static const uint8_t anv_valign[] = {
    [4] = VALIGN4,
    [8] = VALIGN8,
    [16] = VALIGN16,
};
+
+static struct anv_state
+alloc_surface_state(struct anv_device *device,
+                    struct anv_cmd_buffer *cmd_buffer)
+{
+      if (cmd_buffer) {
+         return anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+      } else {
+         return anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+      }
+}
+
/**
 * Get the values to pack into RENDER_SURFACE_STATE.SurfaceHorizontalAlignment
 * and SurfaceVerticalAlignment.
 */
static void
get_halign_valign(const struct isl_surf *surf, uint32_t *halign, uint32_t *valign)
{
   #if ANV_GENx10 >= 90
      if (isl_tiling_is_std_y(surf->tiling) ||
          surf->dim_layout == ISL_DIM_LAYOUT_GEN9_1D) {
         /* The hardware ignores the alignment values. Anyway, the surface's
          * true alignment is likely outside the enum range of HALIGN* and
          * VALIGN*.
          */
         *halign = 0;
         *valign = 0;
      } else {
         /* In Skylake, RENDER_SURFACE_STATE.SurfaceVerticalAlignment is in units
          * of surface elements (not pixels nor samples). For compressed formats,
          * a "surface element" is defined as a compression block.  For example,
          * if SurfaceVerticalAlignment is VALIGN_4 and SurfaceFormat is an ETC2
          * format (ETC2 has a block height of 4), then the vertical alignment is
          * 4 compression blocks or, equivalently, 16 pixels.
          */
         struct isl_extent3d image_align_el
            = isl_surf_get_image_alignment_el(surf);

         /* NOTE(review): assumes the alignments are exactly 4, 8 or 16;
          * any other value would index a zero (or out-of-bounds) entry of
          * anv_halign/anv_valign -- confirm isl guarantees this.
          */
         *halign = anv_halign[image_align_el.width];
         *valign = anv_valign[image_align_el.height];
      }
   #else
      /* Pre-Skylake, RENDER_SURFACE_STATE.SurfaceVerticalAlignment is in
       * units of surface samples.  For example, if SurfaceVerticalAlignment
       * is VALIGN_4 and the surface is singlesampled, then for any surface
       * format (compressed or not) the vertical alignment is
       * 4 pixels.
       */
      struct isl_extent3d image_align_sa
         = isl_surf_get_image_alignment_sa(surf);

      *halign = anv_halign[image_align_sa.width];
      *valign = anv_valign[image_align_sa.height];
   #endif
}
+
+static uint32_t
+get_qpitch(const struct isl_surf *surf)
+{
+   switch (surf->dim) {
+   default:
+      unreachable(!"bad isl_surf_dim");
+   case ISL_SURF_DIM_1D:
+      #if ANV_GENx10 >= 90
+         /* QPitch is usually expressed as rows of surface elements (where
+          * a surface element is an compression block or a single surface
+          * sample). Skylake 1D is an outlier.
+          *
+          * From the Skylake BSpec >> Memory Views >> Common Surface
+          * Formats >> Surface Layout and Tiling >> 1D Surfaces:
+          *
+          *    Surface QPitch specifies the distance in pixels between array
+          *    slices.
+          */
+         return isl_surf_get_array_pitch_el(surf);
+      #else
+         return isl_surf_get_array_pitch_el_rows(surf);
+      #endif
+   case ISL_SURF_DIM_2D:
+   case ISL_SURF_DIM_3D:
+      return isl_surf_get_array_pitch_el_rows(surf);
+   }
+}
+
+void
+genX(image_view_init)(struct anv_image_view *iview,
+                      struct anv_device *device,
+                      const VkImageViewCreateInfo* pCreateInfo,
+                      struct anv_cmd_buffer *cmd_buffer)
+{
+   ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+
+   const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
+
+   struct anv_surface *surface =
+      anv_image_get_surface_for_aspect_mask(image, range->aspectMask);
+
+   static const uint8_t isl_to_gen_tiling[] = {
+      [ISL_TILING_LINEAR]  = LINEAR,
+      [ISL_TILING_X]       = XMAJOR,
+      [ISL_TILING_Y0]      = YMAJOR,
+      [ISL_TILING_Yf]      = YMAJOR,
+      [ISL_TILING_Ys]      = YMAJOR,
+      [ISL_TILING_W]       = WMAJOR,
+   };
+
+   uint32_t halign, valign;
+   get_halign_valign(&surface->isl, &halign, &valign);
+
+   struct GENX(RENDER_SURFACE_STATE) template = {
+      .SurfaceType = anv_surftype(image, pCreateInfo->viewType, false),
+      .SurfaceArray = image->array_size > 1,
+      .SurfaceFormat = iview->format,
+      .SurfaceVerticalAlignment = valign,
+      .SurfaceHorizontalAlignment = halign,
+      .TileMode = isl_to_gen_tiling[surface->isl.tiling],
+      .VerticalLineStride = 0,
+      .VerticalLineStrideOffset = 0,
+      .SamplerL2BypassModeDisable = true,
+      .RenderCacheReadWriteMode = WriteOnlyCache,
+      .MemoryObjectControlState = GENX(MOCS),
+
+      /* The driver sets BaseMipLevel in SAMPLER_STATE, not here in
+       * RENDER_SURFACE_STATE. The Broadwell PRM says "it is illegal to have
+       * both Base Mip Level fields nonzero".
+       */
+      .BaseMipLevel = 0.0,
+
+      .SurfaceQPitch = get_qpitch(&surface->isl) >> 2,
+      .Height = image->extent.height - 1,
+      .Width = image->extent.width - 1,
+      .Depth = 0, /* TEMPLATE */
+      .SurfacePitch = surface->isl.row_pitch - 1,
+      .RenderTargetViewExtent = 0, /* TEMPLATE */
+      .MinimumArrayElement = 0, /* TEMPLATE */
+      .NumberofMultisamples = MULTISAMPLECOUNT_1,
+      .XOffset = 0,
+      .YOffset = 0,
+
+      .MIPCountLOD = 0, /* TEMPLATE */
+      .SurfaceMinLOD = 0, /* TEMPLATE */
+
+      .AuxiliarySurfaceMode = AUX_NONE,
+      .RedClearColor = 0,
+      .GreenClearColor = 0,
+      .BlueClearColor = 0,
+      .AlphaClearColor = 0,
+      .ShaderChannelSelectRed = vk_to_gen_swizzle(pCreateInfo->components.r,
+                                                  VK_COMPONENT_SWIZZLE_R),
+      .ShaderChannelSelectGreen = vk_to_gen_swizzle(pCreateInfo->components.g,
+                                                    VK_COMPONENT_SWIZZLE_G),
+      .ShaderChannelSelectBlue = vk_to_gen_swizzle(pCreateInfo->components.b,
+                                                   VK_COMPONENT_SWIZZLE_B),
+      .ShaderChannelSelectAlpha = vk_to_gen_swizzle(pCreateInfo->components.a,
+                                                    VK_COMPONENT_SWIZZLE_A),
+      .ResourceMinLOD = 0.0,
+      .SurfaceBaseAddress = { NULL, iview->offset },
+   };
+
+   switch (template.SurfaceType) {
+   case SURFTYPE_1D:
+   case SURFTYPE_2D:
+      template.MinimumArrayElement = range->baseArrayLayer;
+
+      /* From the Broadwell PRM >> RENDER_SURFACE_STATE::Depth:
+       *
+       *    For SURFTYPE_1D, 2D, and CUBE: The range of this field is reduced
+       *    by one for each increase from zero of Minimum Array Element. For
+       *    example, if Minimum Array Element is set to 1024 on a 2D surface,
+       *    the range of this field is reduced to [0,1023].
+       *
+       * In other words, 'Depth' is the number of array layers.
+       */
+      template.Depth = range->layerCount - 1;
+
+      /* From the Broadwell PRM >> RENDER_SURFACE_STATE::RenderTargetViewExtent:
+       *
+       *    For Render Target and Typed Dataport 1D and 2D Surfaces:
+       *    This field must be set to the same value as the Depth field.
+       */
+      template.RenderTargetViewExtent = template.Depth;
+      break;
+   case SURFTYPE_CUBE:
+      #if ANV_GENx10 >= 90
+         /* Like SURFTYPE_2D, but divided by 6. */
+         template.MinimumArrayElement = range->baseArrayLayer / 6;
+         template.Depth = range->layerCount / 6 - 1;
+         template.RenderTargetViewExtent = template.Depth;
+      #else
+         /* Same as SURFTYPE_2D */
+         template.MinimumArrayElement = range->baseArrayLayer;
+         template.Depth = range->layerCount - 1;
+         template.RenderTargetViewExtent = template.Depth;
+      #endif
+      break;
+   case SURFTYPE_3D:
+      template.MinimumArrayElement = range->baseArrayLayer;
+
+      /* From the Broadwell PRM >> RENDER_SURFACE_STATE::Depth:
+       *
+       *    If the volume texture is MIP-mapped, this field specifies the
+       *    depth of the base MIP level.
+       */
+      template.Depth = image->extent.depth - 1;
+
+      /* From the Broadwell PRM >> RENDER_SURFACE_STATE::RenderTargetViewExtent:
+       *
+       *    For Render Target and Typed Dataport 3D Surfaces: This field
+       *    indicates the extent of the accessible 'R' coordinates minus 1 on
+       *    the LOD currently being rendered to.
+       */
+      template.RenderTargetViewExtent = iview->extent.depth - 1;
+      break;
+   default:
+      unreachable(!"bad SurfaceType");
+   }
+
+   if (image->needs_nonrt_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->nonrt_surface_state =
+         alloc_surface_state(device, cmd_buffer);
+
+      /* For non render target surfaces, the hardware interprets field
+       * MIPCount/LOD as MIPCount.  The range of levels accessible by the
+       * sampler engine is [SurfaceMinLOD, SurfaceMinLOD + MIPCountLOD].
+       */
+      surface_state.SurfaceMinLOD = range->baseMipLevel;
+      surface_state.MIPCountLOD = MAX2(range->levelCount, 1) - 1;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->nonrt_surface_state.map,
+                                      &surface_state);
+      if (!device->info.has_llc)
+         anv_state_clflush(iview->nonrt_surface_state);
+   } else {
+      iview->nonrt_surface_state.alloc_size = 0;
+   }
+
+   if (image->needs_color_rt_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->color_rt_surface_state =
+         alloc_surface_state(device, cmd_buffer);
+
+      /* For render target surfaces, the hardware interprets field
+       * MIPCount/LOD as LOD. The Broadwell PRM says:
+       *
+       *    MIPCountLOD defines the LOD that will be rendered into.
+       *    SurfaceMinLOD is ignored.
+       */
+      surface_state.MIPCountLOD = range->baseMipLevel;
+      surface_state.SurfaceMinLOD = 0;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->color_rt_surface_state.map,
+                                      &surface_state);
+      if (!device->info.has_llc)
+         anv_state_clflush(iview->color_rt_surface_state);
+   } else {
+      iview->color_rt_surface_state.alloc_size = 0;
+   }
+
+   if (image->needs_storage_surface_state) {
+      struct GENX(RENDER_SURFACE_STATE) surface_state = template;
+
+      iview->storage_surface_state =
+         alloc_surface_state(device, cmd_buffer);
+
+      surface_state.SurfaceType =
+         anv_surftype(image, pCreateInfo->viewType, true),
+
+      surface_state.SurfaceFormat =
+         isl_lower_storage_image_format(&device->isl_dev, iview->format);
+
+      surface_state.SurfaceMinLOD = range->baseMipLevel;
+      surface_state.MIPCountLOD = MAX2(range->levelCount, 1) - 1;
+
+      GENX(RENDER_SURFACE_STATE_pack)(NULL, iview->storage_surface_state.map,
+                                      &surface_state);
+   } else {
+      iview->storage_surface_state.alloc_size = 0;
+   }
+}
+
+VkResult genX(CreateSampler)(
+    VkDevice                                    _device,
+    const VkSamplerCreateInfo*                  pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkSampler*                                  pSampler)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_sampler *sampler;
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
+
+   sampler = anv_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
+                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!sampler)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   uint32_t filter = vk_to_gen_tex_filter(pCreateInfo->magFilter,
+                                          pCreateInfo->anisotropyEnable);
+
+   uint32_t border_color_offset = device->border_colors.offset +
+                                  pCreateInfo->borderColor * 64;
+
+   struct GENX(SAMPLER_STATE) sampler_state = {
+      .SamplerDisable = false,
+      .TextureBorderColorMode = DX10OGL,
+      .LODPreClampMode = CLAMP_OGL,
+#if ANV_GEN == 8
+      .BaseMipLevel = 0.0,
+#endif
+      .MipModeFilter = vk_to_gen_mipmap_mode[pCreateInfo->mipmapMode],
+      .MagModeFilter = filter,
+      .MinModeFilter = filter,
+      .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996),
+      .AnisotropicAlgorithm = EWAApproximation,
+      .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14),
+      .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14),
+      .ChromaKeyEnable = 0,
+      .ChromaKeyIndex = 0,
+      .ChromaKeyMode = 0,
+      .ShadowFunction = vk_to_gen_compare_op[pCreateInfo->compareOp],
+      .CubeSurfaceControlMode = 0,
+
+      .IndirectStatePointer = border_color_offset >> 6,
+
+      .LODClampMagnificationMode = MIPNONE,
+      .MaximumAnisotropy = vk_to_gen_max_anisotropy(pCreateInfo->maxAnisotropy),
+      .RAddressMinFilterRoundingEnable = 0,
+      .RAddressMagFilterRoundingEnable = 0,
+      .VAddressMinFilterRoundingEnable = 0,
+      .VAddressMagFilterRoundingEnable = 0,
+      .UAddressMinFilterRoundingEnable = 0,
+      .UAddressMagFilterRoundingEnable = 0,
+      .TrilinearFilterQuality = 0,
+      .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
+      .TCXAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeU],
+      .TCYAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeV],
+      .TCZAddressControlMode = vk_to_gen_tex_address[pCreateInfo->addressModeW],
+   };
+
+   GENX(SAMPLER_STATE_pack)(NULL, sampler->state, &sampler_state);
+
+   *pSampler = anv_sampler_to_handle(sampler);
+
+   return VK_SUCCESS;
+}
diff --git a/src/vulkan/gen9_pack.h b/src/vulkan/gen9_pack.h
new file mode 100644 (file)
index 0000000..db54e9c
--- /dev/null
@@ -0,0 +1,9797 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+/* Instructions, enums and structures for SKL.
+ *
+ * This file has been generated, do not hand edit.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <assert.h>
+
+#ifndef __gen_validate_value
+#define __gen_validate_value(x)
+#endif
+
+#ifndef __gen_field_functions
+#define __gen_field_functions
+
+union __gen_value {
+   float f;
+   uint32_t dw;
+};
+
+static inline uint64_t
+__gen_mbo(uint32_t start, uint32_t end)
+{
+   return (~0ul >> (64 - (end - start + 1))) << start;
+}
+
+static inline uint64_t
+__gen_field(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   if (end - start + 1 < 64)
+      assert(v < 1ul << (end - start + 1));
+#endif
+
+   return v << start;
+}
+
+static inline uint64_t
+__gen_fixed(float v, uint32_t start, uint32_t end,
+            bool is_signed, uint32_t fract_bits)
+{
+   __gen_validate_value(v);
+
+   const float factor = (1 << fract_bits);
+
+   float max, min;
+   if (is_signed) {
+      max = ((1 << (end - start)) - 1) / factor;
+      min = -(1 << (end - start)) / factor;
+   } else {
+      max = ((1 << (end - start + 1)) - 1) / factor;
+      min = 0.0f;
+   }
+
+   if (v > max)
+      v = max;
+   else if (v < min)
+      v = min;
+
+   int32_t int_val = roundf(v * factor);
+
+   if (is_signed)
+      int_val &= (1 << (end - start + 1)) - 1;
+
+   return int_val << start;
+}
+
+static inline uint64_t
+__gen_offset(uint64_t v, uint32_t start, uint32_t end)
+{
+   __gen_validate_value(v);
+#if DEBUG
+   uint64_t mask = (~0ul >> (64 - (end - start + 1))) << start;
+
+   assert((v & ~mask) == 0);
+#endif
+
+   return v;
+}
+
+static inline uint32_t
+__gen_float(float v)
+{
+   __gen_validate_value(v);
+   return ((union __gen_value) { .f = (v) }).dw;
+}
+
+#ifndef __gen_address_type
+#error #define __gen_address_type before including this file
+#endif
+
+#ifndef __gen_user_data
+#error #define __gen_combine_address before including this file
+#endif
+
+#endif
+
/* 3DSTATE_URB_VS: generated packing helpers -- do not hand edit, this file
 * is produced by a generator.  The _header macro pre-fills the command
 * identification fields; _length is the command size in dwords. */
#define GEN9_3DSTATE_URB_VS_length_bias 0x00000002
#define GEN9_3DSTATE_URB_VS_header              \
   .CommandType          =  3,                  \
   .CommandSubType       =  3,                  \
   ._3DCommandOpcode     =  0,                  \
   ._3DCommandSubOpcode  = 48,                  \
   .DwordLength          =  0

#define GEN9_3DSTATE_URB_VS_length 0x00000002

/* Unpacked form of the two-dword 3DSTATE_URB_VS command. */
struct GEN9_3DSTATE_URB_VS {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   uint32_t                                     VSURBStartingAddress;
   uint32_t                                     VSURBEntryAllocationSize;
   uint32_t                                     VSNumberofURBEntries;
};

/* Pack *values into dst as 2 dwords.  'data' is unused here: this command
 * carries no relocatable addresses. */
static inline void
GEN9_3DSTATE_URB_VS_pack(__gen_user_data *data, void * restrict dst,
                         const struct GEN9_3DSTATE_URB_VS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->VSURBStartingAddress, 25, 31) |
      __gen_field(values->VSURBEntryAllocationSize, 16, 24) |
      __gen_field(values->VSNumberofURBEntries, 0, 15) |
      0;

}
+
/* 3DSTATE_VS: generated packing helpers -- do not hand edit. */
#define GEN9_3DSTATE_VS_length_bias 0x00000002
#define GEN9_3DSTATE_VS_header                  \
   .CommandType          =  3,                  \
   .CommandSubType       =  3,                  \
   ._3DCommandOpcode     =  0,                  \
   ._3DCommandSubOpcode  = 16,                  \
   .DwordLength          =  7

#define GEN9_3DSTATE_VS_length 0x00000009

#define __gen_prefix(name) GEN9_ ## name

/* Unpacked form of the nine-dword 3DSTATE_VS command.  The bare #defines
 * below each enumerate the legal values for the field that follows them. */
struct __gen_prefix(3DSTATE_VS) {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   uint64_t                                     KernelStartPointer;
#define     Multiple                                           0
#define     Single                                             1
   uint32_t                                     SingleVertexDispatch;
#define     Dmask                                              0
#define     Vmask                                              1
   uint32_t                                     VectorMaskEnable;
#define     NoSamplers                                         0
#define     _14Samplers                                        1
#define     _58Samplers                                        2
#define     _912Samplers                                       3
#define     _1316Samplers                                      4
   uint32_t                                     SamplerCount;
   uint32_t                                     BindingTableEntryCount;
#define     Normal                                             0
#define     High                                               1
   uint32_t                                     ThreadDispatchPriority;
#define     IEEE754                                            0
#define     Alternate                                          1
   uint32_t                                     FloatingPointMode;
   bool                                         IllegalOpcodeExceptionEnable;
   bool                                         AccessesUAV;
   bool                                         SoftwareExceptionEnable;
   uint64_t                                     ScratchSpaceBasePointer;
   uint32_t                                     PerThreadScratchSpace;
   uint32_t                                     DispatchGRFStartRegisterForURBData;
   uint32_t                                     VertexURBEntryReadLength;
   uint32_t                                     VertexURBEntryReadOffset;
   uint32_t                                     MaximumNumberofThreads;
   bool                                         StatisticsEnable;
   bool                                         SIMD8DispatchEnable;
   bool                                         VertexCacheDisable;
   bool                                         FunctionEnable;
   uint32_t                                     VertexURBEntryOutputReadOffset;
   uint32_t                                     VertexURBEntryOutputLength;
   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
};

/* Pack *values into dst as 9 dwords.  64-bit fields (kernel and scratch
 * pointers) are built as a qword and split across two dwords.  'data' is
 * unused: no relocatable addresses in this command. */
static inline void
GEN9_3DSTATE_VS_pack(__gen_user_data *data, void * restrict dst,
                     const struct GEN9_3DSTATE_VS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   /* KernelStartPointer is 64-byte aligned, hence offset bits 6..63. */
   uint64_t qw1 =
      __gen_offset(values->KernelStartPointer, 6, 63) |
      0;

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

   dw[3] =
      __gen_field(values->SingleVertexDispatch, 31, 31) |
      __gen_field(values->VectorMaskEnable, 30, 30) |
      __gen_field(values->SamplerCount, 27, 29) |
      __gen_field(values->BindingTableEntryCount, 18, 25) |
      __gen_field(values->ThreadDispatchPriority, 17, 17) |
      __gen_field(values->FloatingPointMode, 16, 16) |
      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
      __gen_field(values->AccessesUAV, 12, 12) |
      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
      0;

   /* ScratchSpaceBasePointer is 1KB aligned (offset bits 10..63). */
   uint64_t qw4 =
      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
      __gen_field(values->PerThreadScratchSpace, 0, 3) |
      0;

   dw[4] = qw4;
   dw[5] = qw4 >> 32;

   dw[6] =
      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
      0;

   dw[7] =
      __gen_field(values->MaximumNumberofThreads, 23, 31) |
      __gen_field(values->StatisticsEnable, 10, 10) |
      __gen_field(values->SIMD8DispatchEnable, 2, 2) |
      __gen_field(values->VertexCacheDisable, 1, 1) |
      __gen_field(values->FunctionEnable, 0, 0) |
      0;

   dw[8] =
      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
      0;

}
+
/* GPGPU_CSR_BASE_ADDRESS: generated packing helpers -- do not hand edit. */
#define GEN9_GPGPU_CSR_BASE_ADDRESS_length_bias 0x00000002
#define GEN9_GPGPU_CSR_BASE_ADDRESS_header      \
   .CommandType          =  3,                  \
   .CommandSubType       =  0,                  \
   ._3DCommandOpcode     =  1,                  \
   ._3DCommandSubOpcode  =  4,                  \
   .DwordLength          =  1

#define GEN9_GPGPU_CSR_BASE_ADDRESS_length 0x00000003

struct GEN9_GPGPU_CSR_BASE_ADDRESS {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     DwordLength;
   __gen_address_type                           GPGPUCSRBaseAddress;
};

/* Pack *values into dst as 3 dwords.  The address qword is produced by the
 * includer-supplied __gen_combine_address, which merges dw1's literal bits
 * with the address (and presumably records a relocation via 'data' --
 * defined by whoever includes this header). */
static inline void
GEN9_GPGPU_CSR_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
                                 const struct GEN9_GPGPU_CSR_BASE_ADDRESS * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint32_t dw1 =
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->GPGPUCSRBaseAddress, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

}
+
/* MI_ATOMIC: generated packing helpers -- do not hand edit. */
#define GEN9_MI_ATOMIC_length_bias 0x00000002
#define GEN9_MI_ATOMIC_header                   \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 47

#define GEN9_MI_ATOMIC_length 0x00000003

struct __gen_prefix(MI_ATOMIC) {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
#define     PerProcessGraphicsAddress                          0
#define     GlobalGraphicsAddress                              1
   uint32_t                                     MemoryType;
   uint32_t                                     PostSyncOperation;
#define     DWORD                                              0
#define     QWORD                                              1
#define     OCTWORD                                            2
#define     RESERVED                                           3
   uint32_t                                     DataSize;
   uint32_t                                     InlineData;
   uint32_t                                     CSSTALL;
   uint32_t                                     ReturnDataControl;
   uint32_t                                     ATOMICOPCODE;
   uint32_t                                     DwordLength;
   __gen_address_type                           MemoryAddress;
   uint32_t                                     Operand1DataDword0;
   uint32_t                                     Operand2DataDword0;
   uint32_t                                     Operand1DataDword1;
   uint32_t                                     Operand2DataDword1;
   uint32_t                                     Operand1DataDword2;
   uint32_t                                     Operand2DataDword2;
   uint32_t                                     Operand1DataDword3;
   uint32_t                                     Operand2DataDword3;
};

/* Pack *values into dst.  NOTE(review): this writes dw[0]..dw[10] (11
 * dwords) while GEN9_MI_ATOMIC_length is 3; presumably the inline-data
 * dwords 3..10 are only emitted when InlineData/DwordLength say so --
 * callers must size the destination accordingly.  Confirm against the
 * generator/PRM. */
static inline void
GEN9_MI_ATOMIC_pack(__gen_user_data *data, void * restrict dst,
                    const struct GEN9_MI_ATOMIC * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->MemoryType, 22, 22) |
      __gen_field(values->PostSyncOperation, 21, 21) |
      __gen_field(values->DataSize, 19, 20) |
      __gen_field(values->InlineData, 18, 18) |
      __gen_field(values->CSSTALL, 17, 17) |
      __gen_field(values->ReturnDataControl, 16, 16) |
      __gen_field(values->ATOMICOPCODE, 8, 15) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint32_t dw1 =
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->MemoryAddress, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

   dw[3] =
      __gen_field(values->Operand1DataDword0, 0, 31) |
      0;

   dw[4] =
      __gen_field(values->Operand2DataDword0, 0, 31) |
      0;

   dw[5] =
      __gen_field(values->Operand1DataDword1, 0, 31) |
      0;

   dw[6] =
      __gen_field(values->Operand2DataDword1, 0, 31) |
      0;

   dw[7] =
      __gen_field(values->Operand1DataDword2, 0, 31) |
      0;

   dw[8] =
      __gen_field(values->Operand2DataDword2, 0, 31) |
      0;

   dw[9] =
      __gen_field(values->Operand1DataDword3, 0, 31) |
      0;

   dw[10] =
      __gen_field(values->Operand2DataDword3, 0, 31) |
      0;

}
+
/* MI_BATCH_BUFFER_START: generated packing helpers -- do not hand edit. */
#define GEN9_MI_BATCH_BUFFER_START_length_bias 0x00000002
#define GEN9_MI_BATCH_BUFFER_START_header       \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 49,                  \
   .DwordLength          =  1

#define GEN9_MI_BATCH_BUFFER_START_length 0x00000003

struct GEN9_MI_BATCH_BUFFER_START {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
#define     Firstlevelbatch                                    0
#define     Secondlevelbatch                                   1
   uint32_t                                     SecondLevelBatchBuffer;
   bool                                         AddOffsetEnable;
   uint32_t                                     PredicationEnable;
   bool                                         ResourceStreamerEnable;
#define     ASI_GGTT                                           0
#define     ASI_PPGTT                                          1
   uint32_t                                     AddressSpaceIndicator;
   uint32_t                                     DwordLength;
   __gen_address_type                           BatchBufferStartAddress;
};

/* Pack *values into dst as 3 dwords; the batch start address qword is
 * merged via the includer-supplied __gen_combine_address. */
static inline void
GEN9_MI_BATCH_BUFFER_START_pack(__gen_user_data *data, void * restrict dst,
                                const struct GEN9_MI_BATCH_BUFFER_START * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->SecondLevelBatchBuffer, 22, 22) |
      __gen_field(values->AddOffsetEnable, 16, 16) |
      __gen_field(values->PredicationEnable, 15, 15) |
      __gen_field(values->ResourceStreamerEnable, 10, 10) |
      __gen_field(values->AddressSpaceIndicator, 8, 8) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   uint32_t dw1 =
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->BatchBufferStartAddress, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

}
+
/* MI_CONDITIONAL_BATCH_BUFFER_END: generated packing helpers -- do not
 * hand edit. */
#define GEN9_MI_CONDITIONAL_BATCH_BUFFER_END_length_bias 0x00000002
#define GEN9_MI_CONDITIONAL_BATCH_BUFFER_END_header\
   .CommandType          =  0,                  \
   .MICommandOpcode      = 54,                  \
   .UseGlobalGTT         =  0,                  \
   .CompareSemaphore     =  0,                  \
   .DwordLength          =  2

#define GEN9_MI_CONDITIONAL_BATCH_BUFFER_END_length 0x00000004

struct GEN9_MI_CONDITIONAL_BATCH_BUFFER_END {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     UseGlobalGTT;
   uint32_t                                     CompareSemaphore;
#define     CompareMaskModeDisabled                            0
#define     CompareMaskModeEnabled                             1
   uint32_t                                     CompareMaskMode;
   uint32_t                                     DwordLength;
   uint32_t                                     CompareDataDword;
   __gen_address_type                           CompareAddress;
};

/* Pack *values into dst as 4 dwords; the compare address qword is merged
 * via the includer-supplied __gen_combine_address. */
static inline void
GEN9_MI_CONDITIONAL_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
                                          const struct GEN9_MI_CONDITIONAL_BATCH_BUFFER_END * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->UseGlobalGTT, 22, 22) |
      __gen_field(values->CompareSemaphore, 21, 21) |
      __gen_field(values->CompareMaskMode, 19, 19) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->CompareDataDword, 0, 31) |
      0;

   uint32_t dw2 =
      0;

   uint64_t qw2 =
      __gen_combine_address(data, &dw[2], values->CompareAddress, dw2);

   dw[2] = qw2;
   dw[3] = qw2 >> 32;

}
+
/* MI_FORCE_WAKEUP: generated packing helpers -- do not hand edit. */
#define GEN9_MI_FORCE_WAKEUP_length_bias 0x00000002
#define GEN9_MI_FORCE_WAKEUP_header             \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 29,                  \
   .DwordLength          =  0

#define GEN9_MI_FORCE_WAKEUP_length 0x00000002

struct GEN9_MI_FORCE_WAKEUP {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     DwordLength;
   uint32_t                                     MaskBits;
   uint32_t                                     ForceRenderAwake;
   uint32_t                                     ForceMediaAwake;
};

/* Pack *values into dst as 2 dwords; no addresses, so 'data' is unused. */
static inline void
GEN9_MI_FORCE_WAKEUP_pack(__gen_user_data *data, void * restrict dst,
                          const struct GEN9_MI_FORCE_WAKEUP * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->MaskBits, 16, 31) |
      __gen_field(values->ForceRenderAwake, 1, 1) |
      __gen_field(values->ForceMediaAwake, 0, 0) |
      0;

}
+
/* MI_LOAD_REGISTER_IMM: generated packing helpers -- do not hand edit. */
#define GEN9_MI_LOAD_REGISTER_IMM_length_bias 0x00000002
#define GEN9_MI_LOAD_REGISTER_IMM_header        \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 34,                  \
   .DwordLength          =  1

#define GEN9_MI_LOAD_REGISTER_IMM_length 0x00000003

struct GEN9_MI_LOAD_REGISTER_IMM {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     ByteWriteDisables;
   uint32_t                                     DwordLength;
   uint32_t                                     RegisterOffset;
   uint32_t                                     DataDWord;
};

/* Pack *values into dst as 3 dwords.  RegisterOffset is a dword-aligned
 * MMIO offset, hence __gen_offset over bits 2..22. */
static inline void
GEN9_MI_LOAD_REGISTER_IMM_pack(__gen_user_data *data, void * restrict dst,
                               const struct GEN9_MI_LOAD_REGISTER_IMM * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->ByteWriteDisables, 8, 11) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_offset(values->RegisterOffset, 2, 22) |
      0;

   dw[2] =
      __gen_field(values->DataDWord, 0, 31) |
      0;

}
+
/* MI_LOAD_REGISTER_REG: generated packing helpers -- do not hand edit. */
#define GEN9_MI_LOAD_REGISTER_REG_length_bias 0x00000002
#define GEN9_MI_LOAD_REGISTER_REG_header        \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 42,                  \
   .DwordLength          =  1

#define GEN9_MI_LOAD_REGISTER_REG_length 0x00000003

struct GEN9_MI_LOAD_REGISTER_REG {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     DwordLength;
   uint32_t                                     SourceRegisterAddress;
   uint32_t                                     DestinationRegisterAddress;
};

/* Pack *values into dst as 3 dwords; both register addresses are
 * dword-aligned MMIO offsets (bits 2..22). */
static inline void
GEN9_MI_LOAD_REGISTER_REG_pack(__gen_user_data *data, void * restrict dst,
                               const struct GEN9_MI_LOAD_REGISTER_REG * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_offset(values->SourceRegisterAddress, 2, 22) |
      0;

   dw[2] =
      __gen_offset(values->DestinationRegisterAddress, 2, 22) |
      0;

}
+
/* MI_SEMAPHORE_SIGNAL: generated packing helpers -- do not hand edit. */
#define GEN9_MI_SEMAPHORE_SIGNAL_length_bias 0x00000002
#define GEN9_MI_SEMAPHORE_SIGNAL_header         \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 27,                  \
   .DwordLength          =  0

#define GEN9_MI_SEMAPHORE_SIGNAL_length 0x00000002

struct GEN9_MI_SEMAPHORE_SIGNAL {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   uint32_t                                     PostSyncOperation;
#define     RCS                                                0
#define     VCS0                                               1
#define     BCS                                                2
#define     VECS                                               3
#define     VCS1                                               4
   uint32_t                                     TargetEngineSelect;
   uint32_t                                     DwordLength;
   uint32_t                                     TargetContextID;
};

/* Pack *values into dst as 2 dwords; no addresses, so 'data' is unused. */
static inline void
GEN9_MI_SEMAPHORE_SIGNAL_pack(__gen_user_data *data, void * restrict dst,
                              const struct GEN9_MI_SEMAPHORE_SIGNAL * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->PostSyncOperation, 21, 21) |
      __gen_field(values->TargetEngineSelect, 15, 17) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->TargetContextID, 0, 31) |
      0;

}
+
/* MI_SEMAPHORE_WAIT: generated packing helpers -- do not hand edit. */
#define GEN9_MI_SEMAPHORE_WAIT_length_bias 0x00000002
#define GEN9_MI_SEMAPHORE_WAIT_header           \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 28,                  \
   .DwordLength          =  2

#define GEN9_MI_SEMAPHORE_WAIT_length 0x00000004

struct GEN9_MI_SEMAPHORE_WAIT {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
#define     PerProcessGraphicsAddress                          0
#define     GlobalGraphicsAddress                              1
   uint32_t                                     MemoryType;
#define     PollingMode                                        1
#define     SignalMode                                         0
   uint32_t                                     WaitMode;
#define     SAD_GREATER_THAN_SDD                               0
#define     SAD_GREATER_THAN_OR_EQUAL_SDD                      1
#define     SAD_LESS_THAN_SDD                                  2
#define     SAD_LESS_THAN_OR_EQUAL_SDD                         3
#define     SAD_EQUAL_SDD                                      4
#define     SAD_NOT_EQUAL_SDD                                  5
   uint32_t                                     CompareOperation;
   uint32_t                                     DwordLength;
   uint32_t                                     SemaphoreDataDword;
   __gen_address_type                           SemaphoreAddress;
};

/* Pack *values into dst as 4 dwords; the semaphore address qword is merged
 * via the includer-supplied __gen_combine_address. */
static inline void
GEN9_MI_SEMAPHORE_WAIT_pack(__gen_user_data *data, void * restrict dst,
                            const struct GEN9_MI_SEMAPHORE_WAIT * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->MemoryType, 22, 22) |
      __gen_field(values->WaitMode, 15, 15) |
      __gen_field(values->CompareOperation, 12, 14) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_field(values->SemaphoreDataDword, 0, 31) |
      0;

   uint32_t dw2 =
      0;

   uint64_t qw2 =
      __gen_combine_address(data, &dw[2], values->SemaphoreAddress, dw2);

   dw[2] = qw2;
   dw[3] = qw2 >> 32;

}
+
/* MI_STORE_DATA_IMM: generated packing helpers -- do not hand edit. */
#define GEN9_MI_STORE_DATA_IMM_length_bias 0x00000002
#define GEN9_MI_STORE_DATA_IMM_header           \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 32,                  \
   .DwordLength          =  2

#define GEN9_MI_STORE_DATA_IMM_length 0x00000004

struct GEN9_MI_STORE_DATA_IMM {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   bool                                         UseGlobalGTT;
   bool                                         StoreQword;
   uint32_t                                     DwordLength;
   __gen_address_type                           Address;
   uint32_t                                     CoreModeEnable;
   uint32_t                                     DataDWord0;
   uint32_t                                     DataDWord1;
};

/* Pack *values into dst.  NOTE(review): this writes dw[0]..dw[4] (5
 * dwords) while GEN9_MI_STORE_DATA_IMM_length is 4; dw[4] presumably
 * matters only for the qword-store form (StoreQword/DwordLength) --
 * callers must size the destination accordingly.  Confirm against the
 * generator/PRM. */
static inline void
GEN9_MI_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
                            const struct GEN9_MI_STORE_DATA_IMM * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->UseGlobalGTT, 22, 22) |
      __gen_field(values->StoreQword, 21, 21) |
      __gen_field(values->DwordLength, 0, 9) |
      0;

   uint32_t dw1 =
      __gen_field(values->CoreModeEnable, 0, 0) |
      0;

   uint64_t qw1 =
      __gen_combine_address(data, &dw[1], values->Address, dw1);

   dw[1] = qw1;
   dw[2] = qw1 >> 32;

   dw[3] =
      __gen_field(values->DataDWord0, 0, 31) |
      0;

   dw[4] =
      __gen_field(values->DataDWord1, 0, 31) |
      0;

}
+
/* MI_STORE_REGISTER_MEM: generated packing helpers -- do not hand edit. */
#define GEN9_MI_STORE_REGISTER_MEM_length_bias 0x00000002
#define GEN9_MI_STORE_REGISTER_MEM_header       \
   .CommandType          =  0,                  \
   .MICommandOpcode      = 36,                  \
   .DwordLength          =  2

#define GEN9_MI_STORE_REGISTER_MEM_length 0x00000004

struct GEN9_MI_STORE_REGISTER_MEM {
   uint32_t                                     CommandType;
   uint32_t                                     MICommandOpcode;
   bool                                         UseGlobalGTT;
   uint32_t                                     PredicateEnable;
   uint32_t                                     DwordLength;
   uint32_t                                     RegisterAddress;
   __gen_address_type                           MemoryAddress;
};

/* Pack *values into dst as 4 dwords; RegisterAddress is a dword-aligned
 * MMIO offset, and the destination memory address qword is merged via the
 * includer-supplied __gen_combine_address. */
static inline void
GEN9_MI_STORE_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
                                const struct GEN9_MI_STORE_REGISTER_MEM * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->MICommandOpcode, 23, 28) |
      __gen_field(values->UseGlobalGTT, 22, 22) |
      __gen_field(values->PredicateEnable, 21, 21) |
      __gen_field(values->DwordLength, 0, 7) |
      0;

   dw[1] =
      __gen_offset(values->RegisterAddress, 2, 22) |
      0;

   uint32_t dw2 =
      0;

   uint64_t qw2 =
      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);

   dw[2] = qw2;
   dw[3] = qw2 >> 32;

}
+
/* PIPELINE_SELECT: generated packing helpers -- do not hand edit.  Note
 * the length bias of 1: this command has no DwordLength field. */
#define GEN9_PIPELINE_SELECT_length_bias 0x00000001
#define GEN9_PIPELINE_SELECT_header             \
   .CommandType          =  3,                  \
   .CommandSubType       =  1,                  \
   ._3DCommandOpcode     =  1,                  \
   ._3DCommandSubOpcode  =  4

#define GEN9_PIPELINE_SELECT_length 0x00000001

struct GEN9_PIPELINE_SELECT {
   uint32_t                                     CommandType;
   uint32_t                                     CommandSubType;
   uint32_t                                     _3DCommandOpcode;
   uint32_t                                     _3DCommandSubOpcode;
   uint32_t                                     MaskBits;
   uint32_t                                     ForceMediaAwake;
   uint32_t                                     MediaSamplerDOPClockGateEnable;
#define     _3D                                                0
#define     Media                                              1
#define     GPGPU                                              2
   uint32_t                                     PipelineSelection;
};

/* Pack *values into dst as a single dword; no addresses, 'data' unused. */
static inline void
GEN9_PIPELINE_SELECT_pack(__gen_user_data *data, void * restrict dst,
                          const struct GEN9_PIPELINE_SELECT * restrict values)
{
   uint32_t *dw = (uint32_t * restrict) dst;

   dw[0] =
      __gen_field(values->CommandType, 29, 31) |
      __gen_field(values->CommandSubType, 27, 28) |
      __gen_field(values->_3DCommandOpcode, 24, 26) |
      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
      __gen_field(values->MaskBits, 8, 15) |
      __gen_field(values->ForceMediaAwake, 5, 5) |
      __gen_field(values->MediaSamplerDOPClockGateEnable, 4, 4) |
      __gen_field(values->PipelineSelection, 0, 1) |
      0;

}
+
+/* STATE_BASE_ADDRESS size constants and default header values
+ * (19-dword command: DwordLength = 17 + 2-dword bias). */
+#define GEN9_STATE_BASE_ADDRESS_length_bias 0x00000002
+#define GEN9_STATE_BASE_ADDRESS_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  1,                  \
+   .DwordLength          = 17
+
+#define GEN9_STATE_BASE_ADDRESS_length 0x00000013
+
+#define GEN9_MEMORY_OBJECT_CONTROL_STATE_length 0x00000001
+
+/* MOCS: a single index into the memory-object control tables. */
+struct GEN9_MEMORY_OBJECT_CONTROL_STATE {
+   uint32_t                                     IndextoMOCSTables;
+};
+
+/* Pack the MOCS index into bits [6:1] of one dword.  Callers (e.g. the
+ * STATE_BASE_ADDRESS pack below) pack into a temporary dword and then
+ * shift the result into the parent command's field. */
+static inline void
+GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN9_MEMORY_OBJECT_CONTROL_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->IndextoMOCSTables, 1, 6) |
+      0;
+
+}
+
+/* Unpacked view of the STATE_BASE_ADDRESS command: five base addresses
+ * (general/surface/dynamic/indirect/instruction), each with a MOCS and a
+ * modify-enable bit, followed by buffer sizes and the bindless surface
+ * state base/size. */
+struct GEN9_STATE_BASE_ADDRESS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GeneralStateBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      GeneralStateMemoryObjectControlState;
+   bool                                         GeneralStateBaseAddressModifyEnable;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      StatelessDataPortAccessMemoryObjectControlState;
+   __gen_address_type                           SurfaceStateBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      SurfaceStateMemoryObjectControlState;
+   bool                                         SurfaceStateBaseAddressModifyEnable;
+   __gen_address_type                           DynamicStateBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      DynamicStateMemoryObjectControlState;
+   bool                                         DynamicStateBaseAddressModifyEnable;
+   __gen_address_type                           IndirectObjectBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      IndirectObjectMemoryObjectControlState;
+   bool                                         IndirectObjectBaseAddressModifyEnable;
+   __gen_address_type                           InstructionBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      InstructionMemoryObjectControlState;
+   bool                                         InstructionBaseAddressModifyEnable;
+   uint32_t                                     GeneralStateBufferSize;
+   bool                                         GeneralStateBufferSizeModifyEnable;
+   uint32_t                                     DynamicStateBufferSize;
+   bool                                         DynamicStateBufferSizeModifyEnable;
+   uint32_t                                     IndirectObjectBufferSize;
+   bool                                         IndirectObjectBufferSizeModifyEnable;
+   uint32_t                                     InstructionBufferSize;
+   /* NOTE(review): lowercase "size" in this name is inconsistent with the
+    * sibling *BufferSizeModifyEnable fields; presumably a generator quirk. */
+   bool                                         InstructionBuffersizeModifyEnable;
+   __gen_address_type                           BindlessSurfaceStateBaseAddress;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      BindlessSurfaceStateMemoryObjectControlState;
+   bool                                         BindlessSurfaceStateBaseAddressModifyEnable;
+   uint32_t                                     BindlessSurfaceStateSize;
+};
+
+/* Pack STATE_BASE_ADDRESS into 19 dwords at `dst`.  Repeated pattern for
+ * each 64-bit base address: pack its MOCS into a temporary dword, OR in the
+ * modify-enable bit, hand the low dword to __gen_combine_address (which may
+ * also record a relocation via `data`), then split the 64-bit result across
+ * two consecutive dwords. */
+static inline void
+GEN9_STATE_BASE_ADDRESS_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN9_STATE_BASE_ADDRESS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* dw1-2: general state base address (MOCS in bits 10:4, enable in bit 0). */
+   uint32_t dw_GeneralStateMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_GeneralStateMemoryObjectControlState, &values->GeneralStateMemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(dw_GeneralStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->GeneralStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->GeneralStateBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* dw3: stateless data-port MOCS only (no address). */
+   uint32_t dw_StatelessDataPortAccessMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StatelessDataPortAccessMemoryObjectControlState, &values->StatelessDataPortAccessMemoryObjectControlState);
+   dw[3] =
+      __gen_field(dw_StatelessDataPortAccessMemoryObjectControlState, 16, 22) |
+      0;
+
+   /* dw4-5: surface state base address. */
+   uint32_t dw_SurfaceStateMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceStateMemoryObjectControlState, &values->SurfaceStateMemoryObjectControlState);
+   uint32_t dw4 =
+      __gen_field(dw_SurfaceStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->SurfaceStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw4 =
+      __gen_combine_address(data, &dw[4], values->SurfaceStateBaseAddress, dw4);
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   /* dw6-7: dynamic state base address. */
+   uint32_t dw_DynamicStateMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DynamicStateMemoryObjectControlState, &values->DynamicStateMemoryObjectControlState);
+   uint32_t dw6 =
+      __gen_field(dw_DynamicStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->DynamicStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw6 =
+      __gen_combine_address(data, &dw[6], values->DynamicStateBaseAddress, dw6);
+
+   dw[6] = qw6;
+   dw[7] = qw6 >> 32;
+
+   /* dw8-9: indirect object base address. */
+   uint32_t dw_IndirectObjectMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_IndirectObjectMemoryObjectControlState, &values->IndirectObjectMemoryObjectControlState);
+   uint32_t dw8 =
+      __gen_field(dw_IndirectObjectMemoryObjectControlState, 4, 10) |
+      __gen_field(values->IndirectObjectBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->IndirectObjectBaseAddress, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   /* dw10-11: instruction base address. */
+   uint32_t dw_InstructionMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_InstructionMemoryObjectControlState, &values->InstructionMemoryObjectControlState);
+   uint32_t dw10 =
+      __gen_field(dw_InstructionMemoryObjectControlState, 4, 10) |
+      __gen_field(values->InstructionBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw10 =
+      __gen_combine_address(data, &dw[10], values->InstructionBaseAddress, dw10);
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+   /* dw12-15: buffer sizes (bits 31:12) with modify-enable in bit 0. */
+   dw[12] =
+      __gen_field(values->GeneralStateBufferSize, 12, 31) |
+      __gen_field(values->GeneralStateBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[13] =
+      __gen_field(values->DynamicStateBufferSize, 12, 31) |
+      __gen_field(values->DynamicStateBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[14] =
+      __gen_field(values->IndirectObjectBufferSize, 12, 31) |
+      __gen_field(values->IndirectObjectBufferSizeModifyEnable, 0, 0) |
+      0;
+
+   dw[15] =
+      __gen_field(values->InstructionBufferSize, 12, 31) |
+      __gen_field(values->InstructionBuffersizeModifyEnable, 0, 0) |
+      0;
+
+   /* dw16-18: bindless surface state base address and size. */
+   uint32_t dw_BindlessSurfaceStateMemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_BindlessSurfaceStateMemoryObjectControlState, &values->BindlessSurfaceStateMemoryObjectControlState);
+   uint32_t dw16 =
+      __gen_field(dw_BindlessSurfaceStateMemoryObjectControlState, 4, 10) |
+      __gen_field(values->BindlessSurfaceStateBaseAddressModifyEnable, 0, 0) |
+      0;
+
+   uint64_t qw16 =
+      __gen_combine_address(data, &dw[16], values->BindlessSurfaceStateBaseAddress, dw16);
+
+   dw[16] = qw16;
+   dw[17] = qw16 >> 32;
+
+   dw[18] =
+      __gen_field(values->BindlessSurfaceStateSize, 12, 31) |
+      0;
+
+}
+
+/* STATE_PREFETCH: 2-dword command carrying a prefetch pointer and count. */
+#define GEN9_STATE_PREFETCH_length_bias 0x00000002
+#define GEN9_STATE_PREFETCH_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  3,                  \
+   .DwordLength          =  0
+
+#define GEN9_STATE_PREFETCH_length 0x00000002
+
+struct GEN9_STATE_PREFETCH {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PrefetchPointer;
+   uint32_t                                     PrefetchCount;
+};
+
+/* Pack STATE_PREFETCH into 2 dwords at `dst`. */
+static inline void
+GEN9_STATE_PREFETCH_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_STATE_PREFETCH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->PrefetchCount, 0, 2) |
+      0;
+
+   /* NOTE(review): unlike the other address-bearing packs in this file, the
+    * 64-bit __gen_combine_address result is truncated into a single dword
+    * here (the command is only 2 dwords long) -- confirm a 32-bit prefetch
+    * pointer is intended on this generation. */
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->PrefetchPointer, dw1);
+
+}
+
+/* STATE_SIP: 3-dword command carrying the 64-bit system instruction
+ * pointer (16-byte aligned: packed as an offset from bit 4). */
+#define GEN9_STATE_SIP_length_bias 0x00000002
+#define GEN9_STATE_SIP_header                   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  0,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2,                  \
+   .DwordLength          =  1
+
+#define GEN9_STATE_SIP_length 0x00000003
+
+struct GEN9_STATE_SIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     SystemInstructionPointer;
+};
+
+/* Pack STATE_SIP into 3 dwords; the 64-bit pointer is split across
+ * dw[1] (low) and dw[2] (high). */
+static inline void
+GEN9_STATE_SIP_pack(__gen_user_data *data, void * restrict dst,
+                    const struct GEN9_STATE_SIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint64_t qw1 =
+      __gen_offset(values->SystemInstructionPointer, 4, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+}
+
+/* 3DPRIMITIVE: 7-dword draw command (vertex/instance counts and start
+ * locations). */
+#define GEN9_3DPRIMITIVE_length_bias 0x00000002
+#define GEN9_3DPRIMITIVE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  3,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  5
+
+#define GEN9_3DPRIMITIVE_length 0x00000007
+
+struct GEN9_3DPRIMITIVE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         IndirectParameterEnable;
+   uint32_t                                     UAVCoherencyRequired;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   bool                                         EndOffsetEnable;
+/* Values for VertexAccessType (SEQUENTIAL = non-indexed, RANDOM = indexed).
+ * NOTE(review): file-scope macros without a GEN9_ prefix -- watch for
+ * collisions. */
+#define     SEQUENTIAL                                         0
+#define     RANDOM                                             1
+   uint32_t                                     VertexAccessType;
+   uint32_t                                     PrimitiveTopologyType;
+   uint32_t                                     VertexCountPerInstance;
+   uint32_t                                     StartVertexLocation;
+   uint32_t                                     InstanceCount;
+   uint32_t                                     StartInstanceLocation;
+   uint32_t                                     BaseVertexLocation;
+};
+
+/* Pack 3DPRIMITIVE into 7 dwords at `dst`. */
+static inline void
+GEN9_3DPRIMITIVE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN9_3DPRIMITIVE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->UAVCoherencyRequired, 9, 9) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->EndOffsetEnable, 9, 9) |
+      __gen_field(values->VertexAccessType, 8, 8) |
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->VertexCountPerInstance, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->StartVertexLocation, 0, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->InstanceCount, 0, 31) |
+      0;
+
+   dw[5] =
+      __gen_field(values->StartInstanceLocation, 0, 31) |
+      0;
+
+   dw[6] =
+      __gen_field(values->BaseVertexLocation, 0, 31) |
+      0;
+
+}
+
+/* 3DSTATE_AA_LINE_PARAMETERS: 3-dword command; the float bias/slope values
+ * are converted to 8-bit fixed point by multiplying by 2^8 before packing. */
+#define GEN9_3DSTATE_AA_LINE_PARAMETERS_length_bias 0x00000002
+#define GEN9_3DSTATE_AA_LINE_PARAMETERS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  1
+
+#define GEN9_3DSTATE_AA_LINE_PARAMETERS_length 0x00000003
+
+struct GEN9_3DSTATE_AA_LINE_PARAMETERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        AAPointCoverageBias;
+   float                                        AACoverageBias;
+   float                                        AAPointCoverageSlope;
+   float                                        AACoverageSlope;
+   float                                        AAPointCoverageEndCapBias;
+   float                                        AACoverageEndCapBias;
+   float                                        AAPointCoverageEndCapSlope;
+   float                                        AACoverageEndCapSlope;
+};
+
+/* Pack 3DSTATE_AA_LINE_PARAMETERS into 3 dwords; four 8-bit fixed-point
+ * fields per data dword. */
+static inline void
+GEN9_3DSTATE_AA_LINE_PARAMETERS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_AA_LINE_PARAMETERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AAPointCoverageBias * (1 << 8), 24, 31) |
+      __gen_field(values->AACoverageBias * (1 << 8), 16, 23) |
+      __gen_field(values->AAPointCoverageSlope * (1 << 8), 8, 15) |
+      __gen_field(values->AACoverageSlope * (1 << 8), 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->AAPointCoverageEndCapBias * (1 << 8), 24, 31) |
+      __gen_field(values->AACoverageEndCapBias * (1 << 8), 16, 23) |
+      __gen_field(values->AAPointCoverageEndCapSlope * (1 << 8), 8, 15) |
+      __gen_field(values->AACoverageEndCapSlope * (1 << 8), 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_DS size constants (length 0: command is
+ * variable-length, see the struct below). */
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 70
+
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_DS_length 0x00000000
+
+#define GEN9_BINDING_TABLE_EDIT_ENTRY_length 0x00000001
+
+/* One entry of a binding-table edit: which table index to update and the
+ * surface-state pointer to write there. */
+struct GEN9_BINDING_TABLE_EDIT_ENTRY {
+   uint32_t                                     BindingTableIndex;
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Pack one edit entry into a single dword (index in bits 23:16, pointer
+ * offset in bits 15:0). */
+static inline void
+GEN9_BINDING_TABLE_EDIT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN9_BINDING_TABLE_EDIT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->BindingTableIndex, 16, 23) |
+      __gen_offset(values->SurfaceStatePointer, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_DS: variable-length command; the fixed two
+ * dwords are followed by BINDING_TABLE_EDIT_ENTRY dwords appended by the
+ * caller. */
+struct GEN9_3DSTATE_BINDING_TABLE_EDIT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+/* Values for BindingTableEditTarget.  NOTE(review): unprefixed file-scope
+ * macros, repeated verbatim for each EDIT_* command below. */
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed portion (2 dwords); note DwordLength here spans bits 8:0,
+ * one bit wider than in the fixed-length commands. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_EDIT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_BINDING_TABLE_EDIT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_GS: same layout as the DS variant above but
+ * with sub-opcode 68; variable-length (entry dwords follow). */
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 68
+
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_GS_length 0x00000000
+
+struct GEN9_3DSTATE_BINDING_TABLE_EDIT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword portion of 3DSTATE_BINDING_TABLE_EDIT_GS. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_EDIT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_BINDING_TABLE_EDIT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_HS: same layout as the DS/GS variants but
+ * with sub-opcode 69; variable-length (entry dwords follow). */
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 69
+
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_HS_length 0x00000000
+
+struct GEN9_3DSTATE_BINDING_TABLE_EDIT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword portion of 3DSTATE_BINDING_TABLE_EDIT_HS. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_EDIT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_BINDING_TABLE_EDIT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_PS: same layout as the other EDIT_* variants
+ * but with sub-opcode 71; variable-length (entry dwords follow). */
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 71
+
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_PS_length 0x00000000
+
+struct GEN9_3DSTATE_BINDING_TABLE_EDIT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword portion of 3DSTATE_BINDING_TABLE_EDIT_PS. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_EDIT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_BINDING_TABLE_EDIT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_EDIT_VS: same layout as the other EDIT_* variants
+ * but with sub-opcode 67; variable-length (entry dwords follow). */
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 67
+
+#define GEN9_3DSTATE_BINDING_TABLE_EDIT_VS_length 0x00000000
+
+struct GEN9_3DSTATE_BINDING_TABLE_EDIT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BindingTableBlockClear;
+#define     AllCores                                           3
+#define     Core1                                              2
+#define     Core0                                              1
+   uint32_t                                     BindingTableEditTarget;
+   /* variable length fields follow */
+};
+
+/* Pack the fixed 2-dword portion of 3DSTATE_BINDING_TABLE_EDIT_VS. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_EDIT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_BINDING_TABLE_EDIT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   dw[1] =
+      __gen_field(values->BindingTableBlockClear, 16, 31) |
+      __gen_field(values->BindingTableEditTarget, 0, 1) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_DS: 2-dword command pointing at the DS
+ * binding table (32-byte-aligned offset, packed from bit 5). */
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 40,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS_length 0x00000002
+
+struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSBindingTable;
+};
+
+/* Pack 3DSTATE_BINDING_TABLE_POINTERS_DS into 2 dwords at `dst`. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_GS: as the DS variant, sub-opcode 41. */
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 41,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS_length 0x00000002
+
+struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSBindingTable;
+};
+
+/* Pack 3DSTATE_BINDING_TABLE_POINTERS_GS into 2 dwords at `dst`. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_HS: as the DS variant, sub-opcode 39. */
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 39,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS_length 0x00000002
+
+struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSBindingTable;
+};
+
+/* Pack 3DSTATE_BINDING_TABLE_POINTERS_HS into 2 dwords at `dst`. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_PS: as the DS variant, sub-opcode 42. */
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 42,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS_length 0x00000002
+
+struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSBindingTable;
+};
+
+/* Pack 3DSTATE_BINDING_TABLE_POINTERS_PS into 2 dwords at `dst`. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_BINDING_TABLE_POINTERS_VS: 2-dword command carrying the
+ * vertex-shader binding-table offset (dw[1] bits 5..15).  Appears
+ * machine-generated; bit ranges per hardware encoding (not verifiable here).
+ */
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 38,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS_length 0x00000002
+
+struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSBindingTable;
+};
+
+/* Packs *values into dst; dst must provide room for 2 dwords. */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSBindingTable, 5, 15) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_BINDING_TABLE_POOL_ALLOC: 4-dword command configuring the
+ * binding-table pool (64-bit base address, enable bit, MOCS, buffer size).
+ * Appears machine-generated; bit ranges per hardware encoding (not
+ * verifiable from this file).
+ */
+#define GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC_length_bias 0x00000002
+#define GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC_length 0x00000004
+
+struct GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           BindingTablePoolBaseAddress;
+   uint32_t                                     BindingTablePoolEnable;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      SurfaceObjectControlState;
+#define     NoValidData                                        0
+   uint32_t                                     BindingTablePoolBufferSize;
+};
+
+/* Packs *values into dst; dst must provide room for 4 dwords.
+ * The base address is relocated via __gen_combine_address and split
+ * across dw[1] (low) and dw[2] (high). */
+static inline void
+GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                           const struct GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed separately, then merged into the low address dword. */
+   uint32_t dw_SurfaceObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SurfaceObjectControlState, &values->SurfaceObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->BindingTablePoolEnable, 11, 11) |
+      __gen_field(dw_SurfaceObjectControlState, 0, 6) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->BindingTablePoolBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->BindingTablePoolBufferSize, 12, 31) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_BLEND_STATE_POINTERS: 2-dword command pointing to the blend
+ * state (dw[1] bits 6..31, valid bit 0).  Appears machine-generated; bit
+ * ranges per hardware encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_BLEND_STATE_POINTERS_length_bias 0x00000002
+#define GEN9_3DSTATE_BLEND_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 36,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_BLEND_STATE_POINTERS_length 0x00000002
+
+struct GEN9_3DSTATE_BLEND_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     BlendStatePointer;
+   bool                                         BlendStatePointerValid;
+};
+
+/* Packs *values into dst; dst must provide room for 2 dwords. */
+static inline void
+GEN9_3DSTATE_BLEND_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN9_3DSTATE_BLEND_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->BlendStatePointer, 6, 31) |
+      __gen_field(values->BlendStatePointerValid, 0, 0) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_CC_STATE_POINTERS: 2-dword command pointing to the
+ * color-calculator state (dw[1] bits 6..31, valid bit 0).  Appears
+ * machine-generated; bit ranges per hardware encoding (not verifiable here).
+ */
+#define GEN9_3DSTATE_CC_STATE_POINTERS_length_bias 0x00000002
+#define GEN9_3DSTATE_CC_STATE_POINTERS_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 14,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_CC_STATE_POINTERS_length 0x00000002
+
+struct GEN9_3DSTATE_CC_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ColorCalcStatePointer;
+   bool                                         ColorCalcStatePointerValid;
+};
+
+/* Packs *values into dst; dst must provide room for 2 dwords. */
+static inline void
+GEN9_3DSTATE_CC_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN9_3DSTATE_CC_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->ColorCalcStatePointer, 6, 31) |
+      __gen_field(values->ColorCalcStatePointerValid, 0, 0) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_CHROMA_KEY: 4-dword command setting one chroma-key table
+ * entry (index in dw[1] bits 30..31; low/high key values fill dw[2]/dw[3]).
+ * Appears machine-generated; bit ranges per hardware encoding (not
+ * verifiable from this file).
+ */
+#define GEN9_3DSTATE_CHROMA_KEY_length_bias 0x00000002
+#define GEN9_3DSTATE_CHROMA_KEY_header          \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_CHROMA_KEY_length 0x00000004
+
+struct GEN9_3DSTATE_CHROMA_KEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ChromaKeyTableIndex;
+   uint32_t                                     ChromaKeyLowValue;
+   uint32_t                                     ChromaKeyHighValue;
+};
+
+/* Packs *values into dst; dst must provide room for 4 dwords. */
+static inline void
+GEN9_3DSTATE_CHROMA_KEY_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN9_3DSTATE_CHROMA_KEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyTableIndex, 30, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChromaKeyLowValue, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ChromaKeyHighValue, 0, 31) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_CLEAR_PARAMS: 3-dword command carrying the depth clear
+ * value (dw[1], raw float bits via __gen_float) and its valid flag
+ * (dw[2] bit 0).  Appears machine-generated; encoding per hardware spec
+ * (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CLEAR_PARAMS_length_bias 0x00000002
+#define GEN9_3DSTATE_CLEAR_PARAMS_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  4,                  \
+   .DwordLength          =  1
+
+#define GEN9_3DSTATE_CLEAR_PARAMS_length 0x00000003
+
+struct GEN9_3DSTATE_CLEAR_PARAMS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        DepthClearValue;
+   bool                                         DepthClearValueValid;
+};
+
+/* Packs *values into dst; dst must provide room for 3 dwords. */
+static inline void
+GEN9_3DSTATE_CLEAR_PARAMS_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_CLEAR_PARAMS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_float(values->DepthClearValue) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DepthClearValueValid, 0, 0) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_CLIP: 4-dword command configuring the clip stage (clip/cull
+ * bitmasks, clip mode, provoking-vertex selects, point-width clamp, max
+ * viewport index).  Appears machine-generated; bit ranges per hardware
+ * encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CLIP_length_bias 0x00000002
+#define GEN9_3DSTATE_CLIP_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_CLIP_length 0x00000004
+
+struct GEN9_3DSTATE_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceUserClipDistanceCullTestEnableBitmask;
+#define     _8Bit                                              0
+#define     _4Bit                                              1
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+   bool                                         EarlyCullEnable;
+/* NOTE(review): "Normal"/"Force" are re-#defined below with identical
+ * values -- legal C (identical redefinition), but these macros are global
+ * in scope despite appearing inside the struct. */
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceUserClipDistanceClipTestEnableBitmask;
+#define     Normal                                             0
+#define     Force                                              1
+   bool                                         ForceClipMode;
+   bool                                         ClipperStatisticsEnable;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+   bool                                         ClipEnable;
+#define     API_OGL                                            0
+   uint32_t                                     APIMode;
+   bool                                         ViewportXYClipTestEnable;
+   bool                                         GuardbandClipTestEnable;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+#define     NORMAL                                             0
+#define     REJECT_ALL                                         3
+#define     ACCEPT_ALL                                         4
+   uint32_t                                     ClipMode;
+   bool                                         PerspectiveDivideDisable;
+   bool                                         NonPerspectiveBarycentricEnable;
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+   float                                        MinimumPointWidth;
+   float                                        MaximumPointWidth;
+   bool                                         ForceZeroRTAIndexEnable;
+   uint32_t                                     MaximumVPIndex;
+};
+
+/* Packs *values into dst; dst must provide room for 4 dwords. */
+static inline void
+GEN9_3DSTATE_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_3DSTATE_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ForceUserClipDistanceCullTestEnableBitmask, 20, 20) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 19, 19) |
+      __gen_field(values->EarlyCullEnable, 18, 18) |
+      __gen_field(values->ForceUserClipDistanceClipTestEnableBitmask, 17, 17) |
+      __gen_field(values->ForceClipMode, 16, 16) |
+      __gen_field(values->ClipperStatisticsEnable, 10, 10) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClipEnable, 31, 31) |
+      __gen_field(values->APIMode, 30, 30) |
+      __gen_field(values->ViewportXYClipTestEnable, 28, 28) |
+      __gen_field(values->GuardbandClipTestEnable, 26, 26) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 16, 23) |
+      __gen_field(values->ClipMode, 13, 15) |
+      __gen_field(values->PerspectiveDivideDisable, 9, 9) |
+      __gen_field(values->NonPerspectiveBarycentricEnable, 8, 8) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 4, 5) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 2, 3) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 0, 1) |
+      0;
+
+   /* Point widths are converted to fixed point by scaling by 2^3
+    * (i.e. a U*.3 format) before field insertion. */
+   dw[3] =
+      __gen_field(values->MinimumPointWidth * (1 << 3), 17, 27) |
+      __gen_field(values->MaximumPointWidth * (1 << 3), 6, 16) |
+      __gen_field(values->ForceZeroRTAIndexEnable, 5, 5) |
+      __gen_field(values->MaximumVPIndex, 0, 3) |
+      0;
+
+}
+
+/* GEN9 3DSTATE_CONSTANT_DS: 11-dword command (header + shared 10-dword
+ * CONSTANT_BODY) supplying push-constant buffers to the domain shader.
+ * The CONSTANT_BODY struct/packer below is shared by all
+ * 3DSTATE_CONSTANT_* stage variants.  Appears machine-generated; bit
+ * ranges per hardware encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CONSTANT_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_CONSTANT_DS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_CONSTANT_DS_length 0x0000000b
+
+#define GEN9_3DSTATE_CONSTANT_BODY_length 0x0000000a
+
+/* Shared body: four read lengths (dw[0]/dw[1]) plus four 64-bit
+ * constant-buffer addresses (dw[2..9]). */
+struct GEN9_3DSTATE_CONSTANT_BODY {
+   uint32_t                                     ConstantBuffer1ReadLength;
+   uint32_t                                     ConstantBuffer0ReadLength;
+   uint32_t                                     ConstantBuffer3ReadLength;
+   uint32_t                                     ConstantBuffer2ReadLength;
+   __gen_address_type                           PointerToConstantBuffer0;
+   __gen_address_type                           PointerToConstantBuffer1;
+   __gen_address_type                           PointerToConstantBuffer2;
+   __gen_address_type                           PointerToConstantBuffer3;
+};
+
+/* Packs *values into dst; dst must provide room for 10 dwords.  Each
+ * buffer address is relocated via __gen_combine_address and split into
+ * low/high dword pairs. */
+static inline void
+GEN9_3DSTATE_CONSTANT_BODY_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN9_3DSTATE_CONSTANT_BODY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ConstantBuffer1ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer0ReadLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBuffer3ReadLength, 16, 31) |
+      __gen_field(values->ConstantBuffer2ReadLength, 0, 15) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->PointerToConstantBuffer0, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   uint32_t dw4 =
+      0;
+
+   uint64_t qw4 =
+      __gen_combine_address(data, &dw[4], values->PointerToConstantBuffer1, dw4);
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   uint32_t dw6 =
+      0;
+
+   uint64_t qw6 =
+      __gen_combine_address(data, &dw[6], values->PointerToConstantBuffer2, dw6);
+
+   dw[6] = qw6;
+   dw[7] = qw6 >> 32;
+
+   uint32_t dw8 =
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->PointerToConstantBuffer3, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+}
+
+struct GEN9_3DSTATE_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN9_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* Packs *values into dst; dst must provide room for 11 dwords.
+ * MOCS occupies header bits 8..14; the body fills dw[1..10]. */
+static inline void
+GEN9_3DSTATE_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN9_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN9 3DSTATE_CONSTANT_GS: 11-dword command (header + 10-dword
+ * GEN9_3DSTATE_CONSTANT_BODY) supplying push-constant buffers to the
+ * geometry shader.  Appears machine-generated; bit ranges per hardware
+ * encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CONSTANT_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_CONSTANT_GS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_CONSTANT_GS_length 0x0000000b
+
+struct GEN9_3DSTATE_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN9_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* Packs *values into dst; dst must provide room for 11 dwords.
+ * MOCS occupies header bits 8..14; the body fills dw[1..10]. */
+static inline void
+GEN9_3DSTATE_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN9_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN9 3DSTATE_CONSTANT_HS: 11-dword command (header + 10-dword
+ * GEN9_3DSTATE_CONSTANT_BODY) supplying push-constant buffers to the
+ * hull shader.  Appears machine-generated; bit ranges per hardware
+ * encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CONSTANT_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_CONSTANT_HS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 25,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_CONSTANT_HS_length 0x0000000b
+
+struct GEN9_3DSTATE_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN9_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* Packs *values into dst; dst must provide room for 11 dwords.
+ * MOCS occupies header bits 8..14; the body fills dw[1..10]. */
+static inline void
+GEN9_3DSTATE_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN9_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN9 3DSTATE_CONSTANT_PS: 11-dword command (header + 10-dword
+ * GEN9_3DSTATE_CONSTANT_BODY) supplying push-constant buffers to the
+ * pixel shader.  Appears machine-generated; bit ranges per hardware
+ * encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CONSTANT_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_CONSTANT_PS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 23,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_CONSTANT_PS_length 0x0000000b
+
+struct GEN9_3DSTATE_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN9_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* Packs *values into dst; dst must provide room for 11 dwords.
+ * MOCS occupies header bits 8..14; the body fills dw[1..10]. */
+static inline void
+GEN9_3DSTATE_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN9_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN9 3DSTATE_CONSTANT_VS: 11-dword command (header + 10-dword
+ * GEN9_3DSTATE_CONSTANT_BODY) supplying push-constant buffers to the
+ * vertex shader.  Appears machine-generated; bit ranges per hardware
+ * encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_CONSTANT_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_CONSTANT_VS_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_CONSTANT_VS_length 0x0000000b
+
+struct GEN9_3DSTATE_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      ConstantBufferObjectControlState;
+   uint32_t                                     DwordLength;
+   struct GEN9_3DSTATE_CONSTANT_BODY            ConstantBody;
+};
+
+/* Packs *values into dst; dst must provide room for 11 dwords.
+ * MOCS occupies header bits 8..14; the body fills dw[1..10]. */
+static inline void
+GEN9_3DSTATE_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint32_t dw_ConstantBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_ConstantBufferObjectControlState, &values->ConstantBufferObjectControlState);
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(dw_ConstantBufferObjectControlState, 8, 14) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   GEN9_3DSTATE_CONSTANT_BODY_pack(data, &dw[1], &values->ConstantBody);
+}
+
+/* GEN9 3DSTATE_DEPTH_BUFFER: 8-dword command describing the bound depth
+ * buffer (surface type/format/pitch, 64-bit base address, dimensions,
+ * MOCS, tiled-resource mode, QPitch).  Appears machine-generated; bit
+ * ranges per hardware encoding (not verifiable from this file).
+ */
+#define GEN9_3DSTATE_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN9_3DSTATE_DEPTH_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  5,                  \
+   .DwordLength          =  6
+
+#define GEN9_3DSTATE_DEPTH_BUFFER_length 0x00000008
+
+struct GEN9_3DSTATE_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         DepthWriteEnable;
+   bool                                         StencilWriteEnable;
+   bool                                         HierarchicalDepthBufferEnable;
+#define     D32_FLOAT                                          1
+#define     D24_UNORM_X8_UINT                                  3
+#define     D16_UNORM                                          5
+   uint32_t                                     SurfaceFormat;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     LOD;
+   uint32_t                                     Depth;
+   uint32_t                                     MinimumArrayElement;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      DepthBufferObjectControlState;
+#define     NONE                                               0
+#define     TILEYF                                             1
+#define     TILEYS                                             2
+   uint32_t                                     TiledResourceMode;
+   uint32_t                                     MipTailStartLOD;
+   uint32_t                                     RenderTargetViewExtent;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Packs *values into dst; dst must provide room for 8 dwords.  The base
+ * address is relocated via __gen_combine_address and split across
+ * dw[2] (low) and dw[3] (high). */
+static inline void
+GEN9_3DSTATE_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->DepthWriteEnable, 28, 28) |
+      __gen_field(values->StencilWriteEnable, 27, 27) |
+      __gen_field(values->HierarchicalDepthBufferEnable, 22, 22) |
+      __gen_field(values->SurfaceFormat, 18, 20) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->Height, 18, 31) |
+      __gen_field(values->Width, 4, 17) |
+      __gen_field(values->LOD, 0, 3) |
+      0;
+
+   uint32_t dw_DepthBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_DepthBufferObjectControlState, &values->DepthBufferObjectControlState);
+   dw[5] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->MinimumArrayElement, 10, 20) |
+      __gen_field(dw_DepthBufferObjectControlState, 0, 6) |
+      0;
+
+   dw[6] =
+      __gen_field(values->TiledResourceMode, 30, 31) |
+      __gen_field(values->MipTailStartLOD, 26, 29) |
+      0;
+
+   dw[7] =
+      __gen_field(values->RenderTargetViewExtent, 21, 31) |
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+/* 3DSTATE_DRAWING_RECTANGLE: header initializer, length (4 dwords), and
+ * the CPU-side mirror struct. The bare #defines (Legacy/Core0Enabled/...)
+ * are the legal values for the field declared immediately below them --
+ * a convention used by this generated header throughout.
+ */
+#define GEN9_3DSTATE_DRAWING_RECTANGLE_length_bias 0x00000002
+#define GEN9_3DSTATE_DRAWING_RECTANGLE_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_DRAWING_RECTANGLE_length 0x00000004
+
+struct GEN9_3DSTATE_DRAWING_RECTANGLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+#define     Legacy                                             0
+#define     Core0Enabled                                       1
+#define     Core1Enabled                                       2
+   uint32_t                                     CoreModeSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ClippedDrawingRectangleYMin;
+   uint32_t                                     ClippedDrawingRectangleXMin;
+   uint32_t                                     ClippedDrawingRectangleYMax;
+   uint32_t                                     ClippedDrawingRectangleXMax;
+   uint32_t                                     DrawingRectangleOriginY;
+   uint32_t                                     DrawingRectangleOriginX;
+};
+
+/* Pack a 3DSTATE_DRAWING_RECTANGLE command (4 dwords) into `dst`.
+ * dw1/dw2 are the clipped rectangle min/max (Y in bits 16..31, X in 0..15),
+ * dw3 the drawing-rectangle origin in the same Y/X split.
+ */
+static inline void
+GEN9_3DSTATE_DRAWING_RECTANGLE_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN9_3DSTATE_DRAWING_RECTANGLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->CoreModeSelect, 14, 15) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ClippedDrawingRectangleYMin, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMin, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClippedDrawingRectangleYMax, 16, 31) |
+      __gen_field(values->ClippedDrawingRectangleXMax, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DrawingRectangleOriginY, 16, 31) |
+      __gen_field(values->DrawingRectangleOriginX, 0, 15) |
+      0;
+
+}
+
+/* 3DSTATE_DS (domain-shader state): header initializer, length (11 dwords),
+ * and mirror struct. Interleaved #defines name the legal values of the
+ * field that follows them (e.g. SamplerCount, FloatingPointMode,
+ * DispatchMode). Note the defines are re-issued for each command struct in
+ * this generated header; redefinition to the same value is well-formed C.
+ */
+#define GEN9_3DSTATE_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_DS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 29,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_DS_length 0x0000000b
+
+struct GEN9_3DSTATE_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     KernelStartPointer;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         AccessesUAV;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     PatchURBEntryReadLength;
+   uint32_t                                     PatchURBEntryReadOffset;
+   uint32_t                                     MaximumNumberofThreads;
+   bool                                         StatisticsEnable;
+#define     SIMD4X2                                            0
+#define     SIMD8_SINGLE_PATCH                                 1
+#define     SIMD8_SINGLE_OR_DUAL_PATCH                         2
+   uint32_t                                     DispatchMode;
+   bool                                         ComputeWCoordinateEnable;
+   bool                                         CacheDisable;
+   bool                                         FunctionEnable;
+   uint32_t                                     VertexURBEntryOutputReadOffset;
+   uint32_t                                     VertexURBEntryOutputLength;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+   uint64_t                                     DUAL_PATCHKernelStartPointer;
+};
+
+/* Pack a 3DSTATE_DS command (11 dwords) into `dst`.
+ * Two 64-bit offsets span dword pairs: KernelStartPointer (dw1/dw2,
+ * 64-byte aligned per the bit-6 start) and ScratchSpaceBasePointer
+ * (dw4/dw5, sharing dw4 with PerThreadScratchSpace). The DUAL_PATCH
+ * kernel pointer occupies dw9/dw10.
+ */
+static inline void
+GEN9_3DSTATE_DS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->AccessesUAV, 14, 14) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   dw[6] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 20, 24) |
+      __gen_field(values->PatchURBEntryReadLength, 11, 17) |
+      __gen_field(values->PatchURBEntryReadOffset, 4, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->MaximumNumberofThreads, 21, 29) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->DispatchMode, 3, 4) |
+      __gen_field(values->ComputeWCoordinateEnable, 2, 2) |
+      __gen_field(values->CacheDisable, 1, 1) |
+      __gen_field(values->FunctionEnable, 0, 0) |
+      0;
+
+   dw[8] =
+      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
+      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+   uint64_t qw9 =
+      __gen_offset(values->DUAL_PATCHKernelStartPointer, 6, 63) |
+      0;
+
+   dw[9] = qw9;
+   dw[10] = qw9 >> 32;
+
+}
+
+/* 3DSTATE_GATHER_CONSTANT_DS header (variable-length command: _length is
+ * 0, DwordLength is supplied by the caller), plus GATHER_CONSTANT_ENTRY,
+ * the one-dword element type of its trailing variable-length table.
+ */
+#define GEN9_3DSTATE_GATHER_CONSTANT_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_CONSTANT_DS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 55
+
+#define GEN9_3DSTATE_GATHER_CONSTANT_DS_length 0x00000000
+
+#define GEN9_GATHER_CONSTANT_ENTRY_length 0x00000001
+
+struct GEN9_GATHER_CONSTANT_ENTRY {
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ChannelMask;
+   uint32_t                                     BindingTableIndexOffset;
+};
+
+/* Pack one GATHER_CONSTANT_ENTRY (1 dword): buffer offset in bits 8..15
+ * (an __gen_offset, i.e. pre-shifted), channel mask in 4..7, binding-table
+ * index offset in 0..3.
+ */
+static inline void
+GEN9_GATHER_CONSTANT_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN9_GATHER_CONSTANT_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->ConstantBufferOffset, 8, 15) |
+      __gen_field(values->ChannelMask, 4, 7) |
+      __gen_field(values->BindingTableIndexOffset, 0, 3) |
+      0;
+
+}
+
+/* Mirror struct for 3DSTATE_GATHER_CONSTANT_DS; the command is followed
+ * by a variable-length list of GATHER_CONSTANT_ENTRY dwords that the
+ * caller appends separately.
+ */
+struct GEN9_3DSTATE_GATHER_CONSTANT_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+#define     CommitGather                                       0
+#define     NonCommitGather                                    1
+   uint32_t                                     UpdateGatherTableOnly;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+#define     Load                                               0
+#define     Read                                               1
+   uint32_t                                     OnDieTable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3-dword fixed prefix of 3DSTATE_GATHER_CONSTANT_DS; the caller
+ * emits the trailing GATHER_CONSTANT_ENTRY table itself.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_CONSTANT_DS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_GATHER_CONSTANT_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      __gen_field(values->UpdateGatherTableOnly, 1, 1) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->OnDieTable, 3, 3) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_GATHER_CONSTANT_GS: same layout as the DS variant, sub-opcode
+ * 53; variable-length command (entry table follows the fixed prefix).
+ */
+#define GEN9_3DSTATE_GATHER_CONSTANT_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_CONSTANT_GS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 53
+
+#define GEN9_3DSTATE_GATHER_CONSTANT_GS_length 0x00000000
+
+struct GEN9_3DSTATE_GATHER_CONSTANT_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+#define     CommitGather                                       0
+#define     NonCommitGather                                    1
+   uint32_t                                     UpdateGatherTableOnly;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+#define     Load                                               0
+#define     Read                                               1
+   uint32_t                                     OnDieTable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3-dword fixed prefix of 3DSTATE_GATHER_CONSTANT_GS (same bit
+ * layout as the DS variant); caller appends the entry table.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_CONSTANT_GS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_GATHER_CONSTANT_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      __gen_field(values->UpdateGatherTableOnly, 1, 1) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->OnDieTable, 3, 3) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_GATHER_CONSTANT_HS: same layout as the DS variant, sub-opcode
+ * 54; variable-length command (entry table follows the fixed prefix).
+ */
+#define GEN9_3DSTATE_GATHER_CONSTANT_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_CONSTANT_HS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 54
+
+#define GEN9_3DSTATE_GATHER_CONSTANT_HS_length 0x00000000
+
+struct GEN9_3DSTATE_GATHER_CONSTANT_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+#define     CommitGather                                       0
+#define     NonCommitGather                                    1
+   uint32_t                                     UpdateGatherTableOnly;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+#define     Load                                               0
+#define     Read                                               1
+   uint32_t                                     OnDieTable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3-dword fixed prefix of 3DSTATE_GATHER_CONSTANT_HS (same bit
+ * layout as the DS variant); caller appends the entry table.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_CONSTANT_HS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_GATHER_CONSTANT_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      __gen_field(values->UpdateGatherTableOnly, 1, 1) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->OnDieTable, 3, 3) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_GATHER_CONSTANT_PS: sub-opcode 56. Unlike the DS/GS/HS
+ * variants this one carries the extra DX9 fields
+ * (DX9OnDieRegisterReadEnable, ConstantBufferDx9Enable).
+ */
+#define GEN9_3DSTATE_GATHER_CONSTANT_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_CONSTANT_PS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 56
+
+#define GEN9_3DSTATE_GATHER_CONSTANT_PS_length 0x00000000
+
+struct GEN9_3DSTATE_GATHER_CONSTANT_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+#define     CommitGather                                       0
+#define     NonCommitGather                                    1
+   uint32_t                                     UpdateGatherTableOnly;
+   bool                                         DX9OnDieRegisterReadEnable;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   bool                                         ConstantBufferDx9Enable;
+#define     Load                                               0
+#define     Read                                               1
+   uint32_t                                     OnDieTable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3-dword fixed prefix of 3DSTATE_GATHER_CONSTANT_PS; includes
+ * the DX9 enable bits absent from the DS/GS/HS variants. Caller appends
+ * the entry table.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_CONSTANT_PS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_GATHER_CONSTANT_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      __gen_field(values->UpdateGatherTableOnly, 1, 1) |
+      __gen_field(values->DX9OnDieRegisterReadEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      __gen_field(values->OnDieTable, 3, 3) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_GATHER_CONSTANT_VS: sub-opcode 52. Like the PS variant it
+ * carries the DX9 fields (DX9OnDieRegisterReadEnable,
+ * ConstantBufferDx9Enable).
+ */
+#define GEN9_3DSTATE_GATHER_CONSTANT_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_CONSTANT_VS_header  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 52
+
+#define GEN9_3DSTATE_GATHER_CONSTANT_VS_length 0x00000000
+
+struct GEN9_3DSTATE_GATHER_CONSTANT_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferValid;
+   uint32_t                                     ConstantBufferBindingTableBlock;
+#define     CommitGather                                       0
+#define     NonCommitGather                                    1
+   uint32_t                                     UpdateGatherTableOnly;
+   bool                                         DX9OnDieRegisterReadEnable;
+   uint32_t                                     GatherBufferOffset;
+   bool                                         ConstantBufferDx9GenerateStall;
+   bool                                         ConstantBufferDx9Enable;
+#define     Load                                               0
+#define     Read                                               1
+   uint32_t                                     OnDieTable;
+   /* variable length fields follow */
+};
+
+/* Pack the 3-dword fixed prefix of 3DSTATE_GATHER_CONSTANT_VS (bit layout
+ * identical to the PS variant); caller appends the entry table.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_CONSTANT_VS_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_3DSTATE_GATHER_CONSTANT_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferValid, 16, 31) |
+      __gen_field(values->ConstantBufferBindingTableBlock, 12, 15) |
+      __gen_field(values->UpdateGatherTableOnly, 1, 1) |
+      __gen_field(values->DX9OnDieRegisterReadEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->GatherBufferOffset, 6, 22) |
+      __gen_field(values->ConstantBufferDx9GenerateStall, 5, 5) |
+      __gen_field(values->ConstantBufferDx9Enable, 4, 4) |
+      __gen_field(values->OnDieTable, 3, 3) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* 3DSTATE_GATHER_POOL_ALLOC: fixed 4-dword command that binds the gather
+ * pool (base address + size + MOCS). __gen_address_type is the driver's
+ * relocatable-address handle.
+ */
+#define GEN9_3DSTATE_GATHER_POOL_ALLOC_length_bias 0x00000002
+#define GEN9_3DSTATE_GATHER_POOL_ALLOC_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 26,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_GATHER_POOL_ALLOC_length 0x00000004
+
+struct GEN9_3DSTATE_GATHER_POOL_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           GatherPoolBaseAddress;
+   bool                                         GatherPoolEnable;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   uint32_t                                     GatherPoolBufferSize;
+};
+
+/* Pack a 3DSTATE_GATHER_POOL_ALLOC command (4 dwords). The enable bit
+ * and MOCS are OR'd into dw1 before being combined with the 64-bit pool
+ * base address (dw1/dw2); dw3 holds the buffer size in its top 20 bits.
+ */
+static inline void
+GEN9_3DSTATE_GATHER_POOL_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN9_3DSTATE_GATHER_POOL_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw_MemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   uint32_t dw1 =
+      __gen_field(values->GatherPoolEnable, 11, 11) |
+      __gen_field(dw_MemoryObjectControlState, 0, 6) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->GatherPoolBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->GatherPoolBufferSize, 12, 31) |
+      0;
+
+}
+
+/* 3DSTATE_GS (geometry-shader state): header initializer, length
+ * (10 dwords), and mirror struct. Interleaved #defines enumerate the
+ * legal values of the field that follows them (SamplerCount,
+ * DispatchMode, ReorderMode, ControlDataFormat, ...).
+ */
+#define GEN9_3DSTATE_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_GS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  8
+
+#define GEN9_3DSTATE_GS_length 0x0000000a
+
+struct GEN9_3DSTATE_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     KernelStartPointer;
+   uint32_t                                     SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         AccessesUAV;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     ExpectedVertexCount;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData54;
+   uint32_t                                     OutputVertexSize;
+   uint32_t                                     OutputTopology;
+   uint32_t                                     VertexURBEntryReadLength;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+   uint32_t                                     ControlDataHeaderSize;
+   uint32_t                                     InstanceControl;
+   uint32_t                                     DefaultStreamId;
+#define     DispatchModeSingle                                 0
+#define     DispatchModeDualInstance                           1
+#define     DispatchModeDualObject                             2
+#define     DispatchModeSIMD8                                  3
+   uint32_t                                     DispatchMode;
+   bool                                         StatisticsEnable;
+   uint32_t                                     InvocationsIncrementValue;
+   bool                                         IncludePrimitiveID;
+   uint32_t                                     Hint;
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         DiscardAdjacency;
+   bool                                         Enable;
+#define     CUT                                                0
+#define     SID                                                1
+   uint32_t                                     ControlDataFormat;
+   bool                                         StaticOutput;
+   uint32_t                                     StaticOutputVertexCount;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     VertexURBEntryOutputReadOffset;
+   uint32_t                                     VertexURBEntryOutputLength;
+   uint32_t                                     UserClipDistanceClipTestEnableBitmask;
+   uint32_t                                     UserClipDistanceCullTestEnableBitmask;
+};
+
+/* Pack a 3DSTATE_GS command (10 dwords) into `dst`.
+ * KernelStartPointer spans dw1/dw2 and ScratchSpaceBasePointer dw4/dw5
+ * (both as 64-bit __gen_offsets). The URB-data GRF start register is
+ * split across two fields: bits 5:4 go to dw6[30:29]
+ * (...ForURBData54) and bits 3:0 to dw6[3:0].
+ */
+static inline void
+GEN9_3DSTATE_GS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->SingleProgramFlow, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->AccessesUAV, 12, 12) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      __gen_field(values->ExpectedVertexCount, 0, 5) |
+      0;
+
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   dw[6] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData54, 29, 30) |
+      __gen_field(values->OutputVertexSize, 23, 28) |
+      __gen_field(values->OutputTopology, 17, 22) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->IncludeVertexHandles, 10, 10) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 0, 3) |
+      0;
+
+   dw[7] =
+      __gen_field(values->ControlDataHeaderSize, 20, 23) |
+      __gen_field(values->InstanceControl, 15, 19) |
+      __gen_field(values->DefaultStreamId, 13, 14) |
+      __gen_field(values->DispatchMode, 11, 12) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->InvocationsIncrementValue, 5, 9) |
+      __gen_field(values->IncludePrimitiveID, 4, 4) |
+      __gen_field(values->Hint, 3, 3) |
+      __gen_field(values->ReorderMode, 2, 2) |
+      __gen_field(values->DiscardAdjacency, 1, 1) |
+      __gen_field(values->Enable, 0, 0) |
+      0;
+
+   dw[8] =
+      __gen_field(values->ControlDataFormat, 31, 31) |
+      __gen_field(values->StaticOutput, 30, 30) |
+      __gen_field(values->StaticOutputVertexCount, 16, 26) |
+      __gen_field(values->MaximumNumberofThreads, 0, 8) |
+      0;
+
+   dw[9] =
+      __gen_field(values->VertexURBEntryOutputReadOffset, 21, 26) |
+      __gen_field(values->VertexURBEntryOutputLength, 16, 20) |
+      __gen_field(values->UserClipDistanceClipTestEnableBitmask, 8, 15) |
+      __gen_field(values->UserClipDistanceCullTestEnableBitmask, 0, 7) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_HIER_DEPTH_BUFFER_length_bias 0x00000002
+#define GEN9_3DSTATE_HIER_DEPTH_BUFFER_header   \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_HIER_DEPTH_BUFFER_length 0x00000005
+
+/* 3DSTATE_HIER_DEPTH_BUFFER (gen9): hierarchical depth (HiZ) buffer state.
+ * NOTE(review): auto-generated packing code; field names and bit ranges
+ * come from the gen9 hardware XML — verify against the PRM before editing.
+ */
+struct GEN9_3DSTATE_HIER_DEPTH_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      HierarchicalDepthBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Packs *values into 5 dwords at dst. dw[2..3] carry the 64-bit surface
+ * base address, combined with a relocation via __gen_combine_address. */
+static inline void
+GEN9_3DSTATE_HIER_DEPTH_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN9_3DSTATE_HIER_DEPTH_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed into a scratch dword, then placed at dw1 bits 25..31. */
+   uint32_t dw_HierarchicalDepthBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_HierarchicalDepthBufferObjectControlState, &values->HierarchicalDepthBufferObjectControlState);
+   dw[1] =
+      __gen_field(dw_HierarchicalDepthBufferObjectControlState, 25, 31) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_HS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 27,                  \
+   .DwordLength          =  7
+
+#define GEN9_3DSTATE_HS_length 0x00000009
+
+/* 3DSTATE_HS (gen9): hull shader (tessellation control) stage state.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   bool                                         Enable;
+   bool                                         StatisticsEnable;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     InstanceCount;
+   uint64_t                                     KernelStartPointer;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     DispatchGRFStartRegisterForURBData5;
+   bool                                         SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+   bool                                         AccessesUAV;
+   bool                                         IncludeVertexHandles;
+   uint32_t                                     DispatchGRFStartRegisterForURBData;
+#define     SINGLE_PATCH                                       0
+#define     DUAL_PATCH                                         1
+#define     _8_PATCH                                           2
+   uint32_t                                     DispatchMode;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   bool                                         IncludePrimitiveID;
+};
+
+/* Packs *values into 9 dwords at dst. 64-bit kernel and scratch pointers
+ * are split across dword pairs (dw[3..4] and dw[5..6]). */
+static inline void
+GEN9_3DSTATE_HS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->SoftwareExceptionEnable, 12, 12) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Enable, 31, 31) |
+      __gen_field(values->StatisticsEnable, 29, 29) |
+      __gen_field(values->MaximumNumberofThreads, 8, 16) |
+      __gen_field(values->InstanceCount, 0, 3) |
+      0;
+
+   uint64_t qw3 =
+      __gen_offset(values->KernelStartPointer, 6, 63) |
+      0;
+
+   dw[3] = qw3;
+   dw[4] = qw3 >> 32;
+
+   uint64_t qw5 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[5] = qw5;
+   dw[6] = qw5 >> 32;
+
+   dw[7] =
+      __gen_field(values->DispatchGRFStartRegisterForURBData5, 28, 28) |
+      __gen_field(values->SingleProgramFlow, 27, 27) |
+      __gen_field(values->VectorMaskEnable, 26, 26) |
+      __gen_field(values->AccessesUAV, 25, 25) |
+      __gen_field(values->IncludeVertexHandles, 24, 24) |
+      __gen_field(values->DispatchGRFStartRegisterForURBData, 19, 23) |
+      __gen_field(values->DispatchMode, 17, 18) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 16) |
+      __gen_field(values->VertexURBEntryReadOffset, 4, 9) |
+      __gen_field(values->IncludePrimitiveID, 0, 0) |
+      0;
+
+   /* dw[8] has no defined fields on gen9; written as zero (reserved). */
+   dw[8] =
+      0;
+
+}
+
+#define GEN9_3DSTATE_INDEX_BUFFER_length_bias 0x00000002
+#define GEN9_3DSTATE_INDEX_BUFFER_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 10,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_INDEX_BUFFER_length 0x00000005
+
+/* 3DSTATE_INDEX_BUFFER (gen9): index buffer binding (format, address, size).
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_INDEX_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     INDEX_BYTE                                         0
+#define     INDEX_WORD                                         1
+#define     INDEX_DWORD                                        2
+   uint32_t                                     IndexFormat;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   __gen_address_type                           BufferStartingAddress;
+   uint32_t                                     BufferSize;
+};
+
+/* Packs *values into 5 dwords at dst. dw[2..3] carry the 64-bit buffer
+ * start address, combined with a relocation via __gen_combine_address. */
+static inline void
+GEN9_3DSTATE_INDEX_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_INDEX_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS is packed into a scratch dword, then placed at dw1 bits 0..6. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[1] =
+      __gen_field(values->IndexFormat, 8, 9) |
+      __gen_field(dw_MemoryObjectControlState, 0, 6) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->BufferStartingAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->BufferSize, 0, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_LINE_STIPPLE_length_bias 0x00000002
+#define GEN9_3DSTATE_LINE_STIPPLE_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  8,                  \
+   .DwordLength          =  1
+
+#define GEN9_3DSTATE_LINE_STIPPLE_length 0x00000003
+
+/* 3DSTATE_LINE_STIPPLE (gen9): line stipple pattern and repeat counters.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_LINE_STIPPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ModifyEnableCurrentRepeatCounterCurrentStippleIndex;
+   uint32_t                                     CurrentRepeatCounter;
+   uint32_t                                     CurrentStippleIndex;
+   uint32_t                                     LineStipplePattern;
+   float                                        LineStippleInverseRepeatCount;
+   uint32_t                                     LineStippleRepeatCount;
+};
+
+/* Packs *values into 3 dwords at dst. */
+static inline void
+GEN9_3DSTATE_LINE_STIPPLE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_LINE_STIPPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ModifyEnableCurrentRepeatCounterCurrentStippleIndex, 31, 31) |
+      __gen_field(values->CurrentRepeatCounter, 21, 29) |
+      __gen_field(values->CurrentStippleIndex, 16, 19) |
+      __gen_field(values->LineStipplePattern, 0, 15) |
+      0;
+
+   /* Float inverse repeat count converted to fixed point by scaling with
+    * 2^16 before packing into bits 15..31. */
+   dw[2] =
+      __gen_field(values->LineStippleInverseRepeatCount * (1 << 16), 15, 31) |
+      __gen_field(values->LineStippleRepeatCount, 0, 8) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_MONOFILTER_SIZE_length_bias 0x00000002
+#define GEN9_3DSTATE_MONOFILTER_SIZE_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 17,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_MONOFILTER_SIZE_length 0x00000002
+
+/* 3DSTATE_MONOFILTER_SIZE (gen9): monochrome filter width/height.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_MONOFILTER_SIZE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     MonochromeFilterWidth;
+   uint32_t                                     MonochromeFilterHeight;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN9_3DSTATE_MONOFILTER_SIZE_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN9_3DSTATE_MONOFILTER_SIZE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->MonochromeFilterWidth, 3, 5) |
+      __gen_field(values->MonochromeFilterHeight, 0, 2) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_MULTISAMPLE_length_bias 0x00000002
+#define GEN9_3DSTATE_MULTISAMPLE_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 13,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_MULTISAMPLE_length 0x00000002
+
+/* 3DSTATE_MULTISAMPLE (gen9): multisample rasterization state.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_MULTISAMPLE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PixelPositionOffsetEnable;
+#define     CENTER                                             0
+#define     UL_CORNER                                          1
+   uint32_t                                     PixelLocation;
+   uint32_t                                     NumberofMultisamples;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN9_3DSTATE_MULTISAMPLE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_MULTISAMPLE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PixelPositionOffsetEnable, 5, 5) |
+      __gen_field(values->PixelLocation, 4, 4) |
+      __gen_field(values->NumberofMultisamples, 1, 3) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_POLY_STIPPLE_OFFSET_length_bias 0x00000002
+#define GEN9_3DSTATE_POLY_STIPPLE_OFFSET_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_POLY_STIPPLE_OFFSET_length 0x00000002
+
+/* 3DSTATE_POLY_STIPPLE_OFFSET (gen9): polygon stipple X/Y offsets.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_POLY_STIPPLE_OFFSET {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PolygonStippleXOffset;
+   uint32_t                                     PolygonStippleYOffset;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN9_3DSTATE_POLY_STIPPLE_OFFSET_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN9_3DSTATE_POLY_STIPPLE_OFFSET * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PolygonStippleXOffset, 8, 12) |
+      __gen_field(values->PolygonStippleYOffset, 0, 4) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_POLY_STIPPLE_PATTERN_length_bias 0x00000002
+#define GEN9_3DSTATE_POLY_STIPPLE_PATTERN_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  7,                  \
+   .DwordLength          = 31
+
+#define GEN9_3DSTATE_POLY_STIPPLE_PATTERN_length 0x00000021
+
+/* 3DSTATE_POLY_STIPPLE_PATTERN (gen9): 32-row polygon stipple pattern.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_POLY_STIPPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PatternRow[32];
+};
+
+/* Packs *values into 33 dwords at dst: header dword plus one dword per
+ * pattern row. */
+static inline void
+GEN9_3DSTATE_POLY_STIPPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN9_3DSTATE_POLY_STIPPLE_PATTERN * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   for (uint32_t i = 0, j = 1; i < 32; i += 1, j++) {
+      dw[j] =
+         __gen_field(values->PatternRow[i + 0], 0, 31) |
+         0;
+   }
+
+}
+
+#define GEN9_3DSTATE_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_PS_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 32,                  \
+   .DwordLength          = 10
+
+#define GEN9_3DSTATE_PS_length 0x0000000c
+
+/* 3DSTATE_PS (gen9): pixel shader stage state. Carries three kernel start
+ * pointers (KernelStartPointer0/1/2); their mapping to the 8/16/32-pixel
+ * dispatch enables is hardware-defined — confirm against the PRM.
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint64_t                                     KernelStartPointer0;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+#define     Dmask                                              0
+#define     Vmask                                              1
+   uint32_t                                     VectorMaskEnable;
+#define     NoSamplers                                         0
+#define     _14Samplers                                        1
+#define     _58Samplers                                        2
+#define     _912Samplers                                       3
+#define     _1316Samplers                                      4
+   uint32_t                                     SamplerCount;
+#define     FlushedtoZero                                      0
+#define     Retained                                           1
+   uint32_t                                     SinglePrecisionDenormalMode;
+   uint32_t                                     BindingTableEntryCount;
+#define     Normal                                             0
+#define     High                                               1
+   uint32_t                                     ThreadDispatchPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint64_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     MaximumNumberofThreadsPerPSD;
+   bool                                         PushConstantEnable;
+   bool                                         RenderTargetFastClearEnable;
+#define     RESOLVE_DISABLED                                   0
+#define     RESOLVE_PARTIAL                                    1
+#define     RESOLVE_FULL                                       3
+   uint32_t                                     RenderTargetResolveType;
+#define     POSOFFSET_NONE                                     0
+#define     POSOFFSET_CENTROID                                 2
+#define     POSOFFSET_SAMPLE                                   3
+   uint32_t                                     PositionXYOffsetSelect;
+   bool                                         _32PixelDispatchEnable;
+   bool                                         _16PixelDispatchEnable;
+   bool                                         _8PixelDispatchEnable;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData0;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData1;
+   uint32_t                                     DispatchGRFStartRegisterForConstantSetupData2;
+   uint64_t                                     KernelStartPointer1;
+   uint64_t                                     KernelStartPointer2;
+};
+
+/* Packs *values into 12 dwords at dst. Each 64-bit pointer (kernels,
+ * scratch base) is split across a pair of dwords. */
+static inline void
+GEN9_3DSTATE_PS_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint64_t qw1 =
+      __gen_offset(values->KernelStartPointer0, 6, 63) |
+      0;
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->SingleProgramFlow, 31, 31) |
+      __gen_field(values->VectorMaskEnable, 30, 30) |
+      __gen_field(values->SamplerCount, 27, 29) |
+      __gen_field(values->SinglePrecisionDenormalMode, 26, 26) |
+      __gen_field(values->BindingTableEntryCount, 18, 25) |
+      __gen_field(values->ThreadDispatchPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->RoundingMode, 14, 15) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   uint64_t qw4 =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 63) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+   dw[6] =
+      __gen_field(values->MaximumNumberofThreadsPerPSD, 23, 31) |
+      __gen_field(values->PushConstantEnable, 11, 11) |
+      __gen_field(values->RenderTargetFastClearEnable, 8, 8) |
+      __gen_field(values->RenderTargetResolveType, 6, 7) |
+      __gen_field(values->PositionXYOffsetSelect, 3, 4) |
+      __gen_field(values->_32PixelDispatchEnable, 2, 2) |
+      __gen_field(values->_16PixelDispatchEnable, 1, 1) |
+      __gen_field(values->_8PixelDispatchEnable, 0, 0) |
+      0;
+
+   dw[7] =
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData0, 16, 22) |
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData1, 8, 14) |
+      __gen_field(values->DispatchGRFStartRegisterForConstantSetupData2, 0, 6) |
+      0;
+
+   uint64_t qw8 =
+      __gen_offset(values->KernelStartPointer1, 6, 63) |
+      0;
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   uint64_t qw10 =
+      __gen_offset(values->KernelStartPointer2, 6, 63) |
+      0;
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+}
+
+#define GEN9_3DSTATE_PS_BLEND_length_bias 0x00000002
+#define GEN9_3DSTATE_PS_BLEND_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 77,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PS_BLEND_length 0x00000002
+
+/* 3DSTATE_PS_BLEND (gen9): pixel-shader-level blend state (blend factors,
+ * alpha test/coverage enables).
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_PS_BLEND {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         AlphaToCoverageEnable;
+   bool                                         HasWriteableRT;
+   bool                                         ColorBufferBlendEnable;
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   bool                                         AlphaTestEnable;
+   bool                                         IndependentAlphaBlendEnable;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN9_3DSTATE_PS_BLEND_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_3DSTATE_PS_BLEND * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->AlphaToCoverageEnable, 31, 31) |
+      __gen_field(values->HasWriteableRT, 30, 30) |
+      __gen_field(values->ColorBufferBlendEnable, 29, 29) |
+      __gen_field(values->SourceAlphaBlendFactor, 24, 28) |
+      __gen_field(values->DestinationAlphaBlendFactor, 19, 23) |
+      __gen_field(values->SourceBlendFactor, 14, 18) |
+      __gen_field(values->DestinationBlendFactor, 9, 13) |
+      __gen_field(values->AlphaTestEnable, 8, 8) |
+      __gen_field(values->IndependentAlphaBlendEnable, 7, 7) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PS_EXTRA_length_bias 0x00000002
+#define GEN9_3DSTATE_PS_EXTRA_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 79,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PS_EXTRA_length 0x00000002
+
+/* 3DSTATE_PS_EXTRA (gen9): additional pixel shader properties (computed
+ * depth mode, kill-pixel, input coverage, etc.).
+ * NOTE(review): auto-generated; bit placements mirror the hardware XML. */
+struct GEN9_3DSTATE_PS_EXTRA {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         PixelShaderValid;
+   bool                                         PixelShaderDoesnotwritetoRT;
+   bool                                         oMaskPresenttoRenderTarget;
+   bool                                         PixelShaderKillsPixel;
+#define     PSCDEPTH_OFF                                       0
+#define     PSCDEPTH_ON                                        1
+#define     PSCDEPTH_ON_GE                                     2
+#define     PSCDEPTH_ON_LE                                     3
+   uint32_t                                     PixelShaderComputedDepthMode;
+   bool                                         ForceComputedDepth;
+   bool                                         PixelShaderUsesSourceDepth;
+   bool                                         PixelShaderUsesSourceW;
+   uint32_t                                     Removed;
+   bool                                         AttributeEnable;
+   bool                                         PixelShaderDisablesAlphaToCoverage;
+   bool                                         PixelShaderIsPerSample;
+   bool                                         PixelShaderComputesStencil;
+   bool                                         PixelShaderPullsBary;
+   bool                                         PixelShaderHasUAV;
+#define     ICMS_NONE                                          0
+#define     ICMS_NORMAL                                        1
+#define     ICMS_INNER_CONSERVATIVE                            2
+#define     ICMS_DEPTH_COVERAGE                                3
+   uint32_t                                     InputCoverageMaskState;
+};
+
+/* Packs *values into 2 dwords at dst. */
+static inline void
+GEN9_3DSTATE_PS_EXTRA_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_3DSTATE_PS_EXTRA * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PixelShaderValid, 31, 31) |
+      __gen_field(values->PixelShaderDoesnotwritetoRT, 30, 30) |
+      __gen_field(values->oMaskPresenttoRenderTarget, 29, 29) |
+      __gen_field(values->PixelShaderKillsPixel, 28, 28) |
+      __gen_field(values->PixelShaderComputedDepthMode, 26, 27) |
+      __gen_field(values->ForceComputedDepth, 25, 25) |
+      __gen_field(values->PixelShaderUsesSourceDepth, 24, 24) |
+      __gen_field(values->PixelShaderUsesSourceW, 23, 23) |
+      __gen_field(values->Removed, 17, 17) |
+      __gen_field(values->AttributeEnable, 8, 8) |
+      __gen_field(values->PixelShaderDisablesAlphaToCoverage, 7, 7) |
+      __gen_field(values->PixelShaderIsPerSample, 6, 6) |
+      __gen_field(values->PixelShaderComputesStencil, 5, 5) |
+      __gen_field(values->PixelShaderPullsBary, 3, 3) |
+      __gen_field(values->PixelShaderHasUAV, 2, 2) |
+      __gen_field(values->InputCoverageMaskState, 0, 1) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS_length 0x00000002
+
+struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 21,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS_length 0x00000002
+
+struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS_length 0x00000002
+
+struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 22,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS_length 0x00000002
+
+struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 18,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS_length 0x00000002
+
+struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ConstantBufferOffset;
+   uint32_t                                     ConstantBufferSize;
+};
+
+static inline void
+GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ConstantBufferOffset, 16, 20) |
+      __gen_field(values->ConstantBufferSize, 0, 5) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_RASTER_length_bias 0x00000002
+#define GEN9_3DSTATE_RASTER_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 80,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_RASTER_length 0x00000005
+
+struct GEN9_3DSTATE_RASTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ViewportZFarClipTestEnable;
+   bool                                         ConservativeRasterizationEnable;
+#define     DX9OGL                                             0
+#define     DX100                                              1
+#define     DX101                                              2
+   uint32_t                                     APIMode;
+#define     Clockwise                                          0
+#define     CounterClockwise                                   1
+   uint32_t                                     FrontWinding;
+#define     FSC_NUMRASTSAMPLES_0                               0
+#define     FSC_NUMRASTSAMPLES_1                               1
+#define     FSC_NUMRASTSAMPLES_2                               2
+#define     FSC_NUMRASTSAMPLES_4                               3
+#define     FSC_NUMRASTSAMPLES_8                               4
+#define     FSC_NUMRASTSAMPLES_16                              5
+   uint32_t                                     ForcedSampleCount;
+#define     CULLMODE_BOTH                                      0
+#define     CULLMODE_NONE                                      1
+#define     CULLMODE_FRONT                                     2
+#define     CULLMODE_BACK                                      3
+   uint32_t                                     CullMode;
+#define     Normal                                             0
+#define     Force                                              1
+   uint32_t                                     ForceMultisampling;
+   bool                                         SmoothPointEnable;
+   bool                                         DXMultisampleRasterizationEnable;
+#define     MSRASTMODE_OFF_PIXEL                               0
+#define     MSRASTMODE_OFF_PATTERN                             1
+#define     MSRASTMODE_ON_PIXEL                                2
+#define     MSRASTMODE_ON_PATTERN                              3
+   uint32_t                                     DXMultisampleRasterizationMode;
+   bool                                         GlobalDepthOffsetEnableSolid;
+   bool                                         GlobalDepthOffsetEnableWireframe;
+   bool                                         GlobalDepthOffsetEnablePoint;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     FrontFaceFillMode;
+#define     RASTER_SOLID                                       0
+#define     RASTER_WIREFRAME                                   1
+#define     RASTER_POINT                                       2
+   uint32_t                                     BackFaceFillMode;
+   bool                                         AntialiasingEnable;
+   bool                                         ScissorRectangleEnable;
+   bool                                         ViewportZNearClipTestEnable;
+   float                                        GlobalDepthOffsetConstant;
+   float                                        GlobalDepthOffsetScale;
+   float                                        GlobalDepthOffsetClamp;
+};
+
+static inline void
+GEN9_3DSTATE_RASTER_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_3DSTATE_RASTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ViewportZFarClipTestEnable, 26, 26) |
+      __gen_field(values->ConservativeRasterizationEnable, 24, 24) |
+      __gen_field(values->APIMode, 22, 23) |
+      __gen_field(values->FrontWinding, 21, 21) |
+      __gen_field(values->ForcedSampleCount, 18, 20) |
+      __gen_field(values->CullMode, 16, 17) |
+      __gen_field(values->ForceMultisampling, 14, 14) |
+      __gen_field(values->SmoothPointEnable, 13, 13) |
+      __gen_field(values->DXMultisampleRasterizationEnable, 12, 12) |
+      __gen_field(values->DXMultisampleRasterizationMode, 10, 11) |
+      __gen_field(values->GlobalDepthOffsetEnableSolid, 9, 9) |
+      __gen_field(values->GlobalDepthOffsetEnableWireframe, 8, 8) |
+      __gen_field(values->GlobalDepthOffsetEnablePoint, 7, 7) |
+      __gen_field(values->FrontFaceFillMode, 5, 6) |
+      __gen_field(values->BackFaceFillMode, 3, 4) |
+      __gen_field(values->AntialiasingEnable, 2, 2) |
+      __gen_field(values->ScissorRectangleEnable, 1, 1) |
+      __gen_field(values->ViewportZNearClipTestEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->GlobalDepthOffsetConstant) |
+      0;
+
+   dw[3] =
+      __gen_float(values->GlobalDepthOffsetScale) |
+      0;
+
+   dw[4] =
+      __gen_float(values->GlobalDepthOffsetClamp) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_RS_CONSTANT_POINTER_length_bias 0x00000002
+#define GEN9_3DSTATE_RS_CONSTANT_POINTER_header \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 84,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_RS_CONSTANT_POINTER_length 0x00000004
+
+struct GEN9_3DSTATE_RS_CONSTANT_POINTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+#define     VS                                                 0
+#define     PS                                                 4
+   uint32_t                                     ShaderSelect;
+#define     RS_STORE                                           0
+#define     RS_LOAD                                            1
+   uint32_t                                     OperationLoadorStore;
+   __gen_address_type                           GlobalConstantBufferAddress;
+   __gen_address_type                           GlobalConstantBufferAddressHigh;
+};
+
+static inline void
+GEN9_3DSTATE_RS_CONSTANT_POINTER_pack(__gen_user_data *data, void * restrict dst,
+                                      const struct GEN9_3DSTATE_RS_CONSTANT_POINTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ShaderSelect, 28, 30) |
+      __gen_field(values->OperationLoadorStore, 12, 12) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->GlobalConstantBufferAddress, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->GlobalConstantBufferAddressHigh, dw3);
+
+}
+
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  =  2
+
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0_length 0x00000000
+
+#define GEN9_PALETTE_ENTRY_length 0x00000001
+
+struct GEN9_PALETTE_ENTRY {
+   uint32_t                                     Alpha;
+   uint32_t                                     Red;
+   uint32_t                                     Green;
+   uint32_t                                     Blue;
+};
+
+static inline void
+GEN9_PALETTE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_PALETTE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->Alpha, 24, 31) |
+      __gen_field(values->Red, 16, 23) |
+      __gen_field(values->Green, 8, 15) |
+      __gen_field(values->Blue, 0, 7) |
+      0;
+
+}
+
+struct GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_SAMPLER_PALETTE_LOAD0 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 12
+
+#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1_length 0x00000000
+
+struct GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1 {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1 * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 45,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoDSSamplerState;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoDSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 46,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoGSSamplerState;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoGSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 44,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoHSSamplerState;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoHSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 47,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoPSSamplerState;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoPSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 43,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PointertoVSSamplerState;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS_pack(__gen_user_data *data, void * restrict dst,
+                                            const struct GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->PointertoVSSamplerState, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLE_MASK_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLE_MASK_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SAMPLE_MASK_length 0x00000002
+
+struct GEN9_3DSTATE_SAMPLE_MASK {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SampleMask;
+};
+
+static inline void
+GEN9_3DSTATE_SAMPLE_MASK_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_SAMPLE_MASK * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SampleMask, 0, 15) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SAMPLE_PATTERN_length_bias 0x00000002
+#define GEN9_3DSTATE_SAMPLE_PATTERN_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  7
+
+#define GEN9_3DSTATE_SAMPLE_PATTERN_length 0x00000009
+
+struct GEN9_3DSTATE_SAMPLE_PATTERN {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        _16xSample3XOffset;
+   float                                        _16xSample3YOffset;
+   float                                        _16xSample2XOffset;
+   float                                        _16xSample2YOffset;
+   float                                        _16xSample1XOffset;
+   float                                        _16xSample1YOffset;
+   float                                        _16xSample0XOffset;
+   float                                        _16xSample0YOffset;
+   float                                        _16xSample7XOffset;
+   float                                        _16xSample7YOffset;
+   float                                        _16xSample6XOffset;
+   float                                        _16xSample6YOffset;
+   float                                        _16xSample5XOffset;
+   float                                        _16xSample5YOffset;
+   float                                        _16xSample4XOffset;
+   float                                        _16xSample4YOffset;
+   float                                        _16xSample11XOffset;
+   float                                        _16xSample11YOffset;
+   float                                        _16xSample10XOffset;
+   float                                        _16xSample10YOffset;
+   float                                        _16xSample9XOffset;
+   float                                        _16xSample9YOffset;
+   float                                        _16xSample8XOffset;
+   float                                        _16xSample8YOffset;
+   float                                        _16xSample15XOffset;
+   float                                        _16xSample15YOffset;
+   float                                        _16xSample14XOffset;
+   float                                        _16xSample14YOffset;
+   float                                        _16xSample13XOffset;
+   float                                        _16xSample13YOffset;
+   float                                        _16xSample12XOffset;
+   float                                        _16xSample12YOffset;
+   float                                        _8xSample7XOffset;
+   float                                        _8xSample7YOffset;
+   float                                        _8xSample6XOffset;
+   float                                        _8xSample6YOffset;
+   float                                        _8xSample5XOffset;
+   float                                        _8xSample5YOffset;
+   float                                        _8xSample4XOffset;
+   float                                        _8xSample4YOffset;
+   float                                        _8xSample3XOffset;
+   float                                        _8xSample3YOffset;
+   float                                        _8xSample2XOffset;
+   float                                        _8xSample2YOffset;
+   float                                        _8xSample1XOffset;
+   float                                        _8xSample1YOffset;
+   float                                        _8xSample0XOffset;
+   float                                        _8xSample0YOffset;
+   float                                        _4xSample3XOffset;
+   float                                        _4xSample3YOffset;
+   float                                        _4xSample2XOffset;
+   float                                        _4xSample2YOffset;
+   float                                        _4xSample1XOffset;
+   float                                        _4xSample1YOffset;
+   float                                        _4xSample0XOffset;
+   float                                        _4xSample0YOffset;
+   float                                        _1xSample0XOffset;
+   float                                        _1xSample0YOffset;
+   float                                        _2xSample1XOffset;
+   float                                        _2xSample1YOffset;
+   float                                        _2xSample0XOffset;
+   float                                        _2xSample0YOffset;
+};
+
+/* Emit 3DSTATE_SAMPLE_PATTERN: programs the MSAA sample positions for
+ * the 1x/2x/4x/8x/16x patterns.  Each float X/Y offset is converted to
+ * a 4-bit fixed-point fraction (offset * 16) and packed into one nibble
+ * of the target dword.
+ * NOTE(review): this header appears machine-generated; prefer changing
+ * the generator over hand-editing these pack functions. */
+static inline void
+GEN9_3DSTATE_SAMPLE_PATTERN_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN9_3DSTATE_SAMPLE_PATTERN * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1-DW4: 16x pattern, samples 0-15, one X/Y nibble pair per sample */
+   dw[1] =
+      __gen_field(values->_16xSample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_16xSample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_16xSample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_16xSample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_16xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_16xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_16xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_16xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_field(values->_16xSample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_16xSample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_16xSample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_16xSample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_16xSample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_16xSample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_16xSample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_16xSample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[3] =
+      __gen_field(values->_16xSample11XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_16xSample11YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_16xSample10XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_16xSample10YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_16xSample9XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_16xSample9YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_16xSample8XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_16xSample8YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[4] =
+      __gen_field(values->_16xSample15XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_16xSample15YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_16xSample14XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_16xSample14YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_16xSample13XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_16xSample13YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_16xSample12XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_16xSample12YOffset * (1 << 4), 0, 3) |
+      0;
+
+   /* DW5-DW6: 8x pattern, samples 0-7 */
+   dw[5] =
+      __gen_field(values->_8xSample7XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_8xSample7YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_8xSample6XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_8xSample6YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_8xSample5XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_8xSample5YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_8xSample4XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_8xSample4YOffset * (1 << 4), 0, 3) |
+      0;
+
+   dw[6] =
+      __gen_field(values->_8xSample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_8xSample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_8xSample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_8xSample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_8xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_8xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_8xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_8xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   /* DW7: 4x pattern, samples 0-3 */
+   dw[7] =
+      __gen_field(values->_4xSample3XOffset * (1 << 4), 28, 31) |
+      __gen_field(values->_4xSample3YOffset * (1 << 4), 24, 27) |
+      __gen_field(values->_4xSample2XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_4xSample2YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_4xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_4xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_4xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_4xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+   /* DW8: 1x pattern (bits 16-23) and 2x pattern (bits 0-15) share a dword */
+   dw[8] =
+      __gen_field(values->_1xSample0XOffset * (1 << 4), 20, 23) |
+      __gen_field(values->_1xSample0YOffset * (1 << 4), 16, 19) |
+      __gen_field(values->_2xSample1XOffset * (1 << 4), 12, 15) |
+      __gen_field(values->_2xSample1YOffset * (1 << 4), 8, 11) |
+      __gen_field(values->_2xSample0XOffset * (1 << 4), 4, 7) |
+      __gen_field(values->_2xSample0YOffset * (1 << 4), 0, 3) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SBE_length_bias 0x00000002
+#define GEN9_3DSTATE_SBE_header                 \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 31,                  \
+   .DwordLength          =  4
+
+#define GEN9_3DSTATE_SBE_length 0x00000006
+
+/* 3DSTATE_SBE: setup-backend attribute state (swizzling, point-sprite
+ * coordinates, URB read offsets).  The per-attribute ActiveComponentFormat
+ * fields occupy 2 bits each in DW4 (attrs 0-15) and DW5 (attrs 16-31). */
+struct GEN9_3DSTATE_SBE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ForceVertexURBEntryReadLength;
+   bool                                         ForceVertexURBEntryReadOffset;
+   uint32_t                                     NumberofSFOutputAttributes;
+   bool                                         AttributeSwizzleEnable;
+#define     UPPERLEFT                                          0
+#define     LOWERLEFT                                          1
+   uint32_t                                     PointSpriteTextureCoordinateOrigin;
+   bool                                         PrimitiveIDOverrideComponentW;
+   bool                                         PrimitiveIDOverrideComponentZ;
+   bool                                         PrimitiveIDOverrideComponentY;
+   bool                                         PrimitiveIDOverrideComponentX;
+   uint32_t                                     VertexURBEntryReadLength;
+   uint32_t                                     VertexURBEntryReadOffset;
+   uint32_t                                     PrimitiveIDOverrideAttributeSelect;
+   uint32_t                                     PointSpriteTextureCoordinateEnable;
+   uint32_t                                     ConstantInterpolationEnable;
+   uint32_t                                     Attribute15ActiveComponentFormat;
+   uint32_t                                     Attribute14ActiveComponentFormat;
+   uint32_t                                     Attribute13ActiveComponentFormat;
+   uint32_t                                     Attribute12ActiveComponentFormat;
+   uint32_t                                     Attribute11ActiveComponentFormat;
+   uint32_t                                     Attribute10ActiveComponentFormat;
+   uint32_t                                     Attribute9ActiveComponentFormat;
+   uint32_t                                     Attribute8ActiveComponentFormat;
+   uint32_t                                     Attribute7ActiveComponentFormat;
+   uint32_t                                     Attribute6ActiveComponentFormat;
+   uint32_t                                     Attribute5ActiveComponentFormat;
+   uint32_t                                     Attribute4ActiveComponentFormat;
+   uint32_t                                     Attribute3ActiveComponentFormat;
+   uint32_t                                     Attribute2ActiveComponentFormat;
+   uint32_t                                     Attribute1ActiveComponentFormat;
+   uint32_t                                     Attribute0ActiveComponentFormat;
+   uint32_t                                     Attribute31ActiveComponentFormat;
+   uint32_t                                     Attribute30ActiveComponentFormat;
+   uint32_t                                     Attribute29ActiveComponentFormat;
+   uint32_t                                     Attribute28ActiveComponentFormat;
+   uint32_t                                     Attribute27ActiveComponentFormat;
+   uint32_t                                     Attribute26ActiveComponentFormat;
+   uint32_t                                     Attribute25ActiveComponentFormat;
+   uint32_t                                     Attribute24ActiveComponentFormat;
+   uint32_t                                     Attribute23ActiveComponentFormat;
+   uint32_t                                     Attribute22ActiveComponentFormat;
+   uint32_t                                     Attribute21ActiveComponentFormat;
+   uint32_t                                     Attribute20ActiveComponentFormat;
+   uint32_t                                     Attribute19ActiveComponentFormat;
+   uint32_t                                     Attribute18ActiveComponentFormat;
+   uint32_t                                     Attribute17ActiveComponentFormat;
+   uint32_t                                     Attribute16ActiveComponentFormat;
+};
+
+/* Emit 3DSTATE_SBE into 6 dwords at `dst`. */
+static inline void
+GEN9_3DSTATE_SBE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN9_3DSTATE_SBE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ForceVertexURBEntryReadLength, 29, 29) |
+      __gen_field(values->ForceVertexURBEntryReadOffset, 28, 28) |
+      __gen_field(values->NumberofSFOutputAttributes, 22, 27) |
+      __gen_field(values->AttributeSwizzleEnable, 21, 21) |
+      __gen_field(values->PointSpriteTextureCoordinateOrigin, 20, 20) |
+      __gen_field(values->PrimitiveIDOverrideComponentW, 19, 19) |
+      __gen_field(values->PrimitiveIDOverrideComponentZ, 18, 18) |
+      __gen_field(values->PrimitiveIDOverrideComponentY, 17, 17) |
+      __gen_field(values->PrimitiveIDOverrideComponentX, 16, 16) |
+      __gen_field(values->VertexURBEntryReadLength, 11, 15) |
+      __gen_field(values->VertexURBEntryReadOffset, 5, 10) |
+      __gen_field(values->PrimitiveIDOverrideAttributeSelect, 0, 4) |
+      0;
+
+   /* DW2/DW3: 32-bit per-attribute enable masks */
+   dw[2] =
+      __gen_field(values->PointSpriteTextureCoordinateEnable, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ConstantInterpolationEnable, 0, 31) |
+      0;
+
+   /* DW4: 2-bit component formats for attributes 0-15 */
+   dw[4] =
+      __gen_field(values->Attribute15ActiveComponentFormat, 30, 31) |
+      __gen_field(values->Attribute14ActiveComponentFormat, 28, 29) |
+      __gen_field(values->Attribute13ActiveComponentFormat, 26, 27) |
+      __gen_field(values->Attribute12ActiveComponentFormat, 24, 25) |
+      __gen_field(values->Attribute11ActiveComponentFormat, 22, 23) |
+      __gen_field(values->Attribute10ActiveComponentFormat, 20, 21) |
+      __gen_field(values->Attribute9ActiveComponentFormat, 18, 19) |
+      __gen_field(values->Attribute8ActiveComponentFormat, 16, 17) |
+      __gen_field(values->Attribute7ActiveComponentFormat, 14, 15) |
+      __gen_field(values->Attribute6ActiveComponentFormat, 12, 13) |
+      __gen_field(values->Attribute5ActiveComponentFormat, 10, 11) |
+      __gen_field(values->Attribute4ActiveComponentFormat, 8, 9) |
+      __gen_field(values->Attribute3ActiveComponentFormat, 6, 7) |
+      __gen_field(values->Attribute2ActiveComponentFormat, 4, 5) |
+      __gen_field(values->Attribute1ActiveComponentFormat, 2, 3) |
+      __gen_field(values->Attribute0ActiveComponentFormat, 0, 1) |
+      0;
+
+   /* DW5: 2-bit component formats for attributes 16-31 */
+   dw[5] =
+      __gen_field(values->Attribute31ActiveComponentFormat, 30, 31) |
+      __gen_field(values->Attribute30ActiveComponentFormat, 28, 29) |
+      __gen_field(values->Attribute29ActiveComponentFormat, 26, 27) |
+      __gen_field(values->Attribute28ActiveComponentFormat, 24, 25) |
+      __gen_field(values->Attribute27ActiveComponentFormat, 22, 23) |
+      __gen_field(values->Attribute26ActiveComponentFormat, 20, 21) |
+      __gen_field(values->Attribute25ActiveComponentFormat, 18, 19) |
+      __gen_field(values->Attribute24ActiveComponentFormat, 16, 17) |
+      __gen_field(values->Attribute23ActiveComponentFormat, 14, 15) |
+      __gen_field(values->Attribute22ActiveComponentFormat, 12, 13) |
+      __gen_field(values->Attribute21ActiveComponentFormat, 10, 11) |
+      __gen_field(values->Attribute20ActiveComponentFormat, 8, 9) |
+      __gen_field(values->Attribute19ActiveComponentFormat, 6, 7) |
+      __gen_field(values->Attribute18ActiveComponentFormat, 4, 5) |
+      __gen_field(values->Attribute17ActiveComponentFormat, 2, 3) |
+      __gen_field(values->Attribute16ActiveComponentFormat, 0, 1) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SBE_SWIZ_length_bias 0x00000002
+#define GEN9_3DSTATE_SBE_SWIZ_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 81,                  \
+   .DwordLength          =  9
+
+#define GEN9_3DSTATE_SBE_SWIZ_length 0x0000000b
+
+#define GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL_length 0x00000001
+
+/* One 16-bit attribute-swizzle record used by 3DSTATE_SBE_SWIZ: selects
+ * the source attribute and optional per-component constant overrides. */
+struct GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL {
+   bool                                         ComponentOverrideW;
+   bool                                         ComponentOverrideZ;
+   bool                                         ComponentOverrideY;
+   bool                                         ComponentOverrideX;
+   uint32_t                                     SwizzleControlMode;
+#define     CONST_0000                                         0
+#define     CONST_0001_FLOAT                                   1
+#define     CONST_1111_FLOAT                                   2
+#define     PRIM_ID                                            3
+   uint32_t                                     ConstantSource;
+#define     INPUTATTR                                          0
+#define     INPUTATTR_FACING                                   1
+#define     INPUTATTR_W                                        2
+#define     INPUTATTR_FACING_W                                 3
+   uint32_t                                     SwizzleSelect;
+   uint32_t                                     SourceAttribute;
+};
+
+/* Pack one attribute-detail record into the low 16 bits of a dword. */
+static inline void
+GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(__gen_user_data *data, void * restrict dst,
+                                     const struct GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ComponentOverrideW, 15, 15) |
+      __gen_field(values->ComponentOverrideZ, 14, 14) |
+      __gen_field(values->ComponentOverrideY, 13, 13) |
+      __gen_field(values->ComponentOverrideX, 12, 12) |
+      __gen_field(values->SwizzleControlMode, 11, 11) |
+      __gen_field(values->ConstantSource, 9, 10) |
+      __gen_field(values->SwizzleSelect, 6, 7) |
+      __gen_field(values->SourceAttribute, 0, 4) |
+      0;
+
+}
+
+/* 3DSTATE_SBE_SWIZ: 16 attribute-swizzle records plus per-attribute
+ * wrap-shortest enables (11 dwords total, see _SWIZ_length above). */
+struct GEN9_3DSTATE_SBE_SWIZ {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   struct GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL       Attribute[16];
+   uint32_t                                     AttributeWrapShortestEnables[16];
+};
+
+static inline void
+GEN9_3DSTATE_SBE_SWIZ_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_3DSTATE_SBE_SWIZ * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1-DW8: two 16-bit attribute-detail records per dword */
+   for (uint32_t i = 0, j = 1; i < 16; i += 2, j++) {
+      uint32_t dw_Attribute0;
+      GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(data, &dw_Attribute0, &values->Attribute[i + 0]);
+      uint32_t dw_Attribute1;
+      GEN9_SF_OUTPUT_ATTRIBUTE_DETAIL_pack(data, &dw_Attribute1, &values->Attribute[i + 1]);
+      dw[j] =
+         __gen_field(dw_Attribute0, 0, 15) |
+         __gen_field(dw_Attribute1, 16, 31) |
+         0;
+   }
+
+   /* DW9-DW10: eight 4-bit wrap-shortest enable fields per dword */
+   for (uint32_t i = 0, j = 9; i < 16; i += 8, j++) {
+      dw[j] =
+         __gen_field(values->AttributeWrapShortestEnables[i + 0], 0, 3) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 1], 4, 7) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 2], 8, 11) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 3], 12, 15) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 4], 16, 19) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 5], 20, 23) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 6], 24, 27) |
+         __gen_field(values->AttributeWrapShortestEnables[i + 7], 28, 31) |
+         0;
+   }
+
+}
+
+#define GEN9_3DSTATE_SCISSOR_STATE_POINTERS_length_bias 0x00000002
+#define GEN9_3DSTATE_SCISSOR_STATE_POINTERS_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 15,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_SCISSOR_STATE_POINTERS_length 0x00000002
+
+/* 3DSTATE_SCISSOR_STATE_POINTERS: points the hardware at the scissor
+ * rectangle state.  The pointer is 32-byte aligned (packed from bit 5). */
+struct GEN9_3DSTATE_SCISSOR_STATE_POINTERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScissorRectPointer;
+};
+
+static inline void
+GEN9_3DSTATE_SCISSOR_STATE_POINTERS_pack(__gen_user_data *data, void * restrict dst,
+                                         const struct GEN9_3DSTATE_SCISSOR_STATE_POINTERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: offset stored via __gen_offset, bits 5-31 (low 5 bits implied 0) */
+   dw[1] =
+      __gen_offset(values->ScissorRectPointer, 5, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SF_length_bias 0x00000002
+#define GEN9_3DSTATE_SF_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 19,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_SF_length 0x00000004
+
+/* 3DSTATE_SF: strips-and-fans (rasterizer front-end) state.
+ * LineWidth and PointWidth are floats stored as unsigned fixed point
+ * (7 and 3 fractional bits respectively) by the pack function below. */
+struct GEN9_3DSTATE_SF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   float                                        LineWidth;
+   bool                                         LegacyGlobalDepthBiasEnable;
+   bool                                         StatisticsEnable;
+   bool                                         ViewportTransformEnable;
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+   bool                                         LastPixelEnable;
+   uint32_t                                     TriangleStripListProvokingVertexSelect;
+   uint32_t                                     LineStripListProvokingVertexSelect;
+   uint32_t                                     TriangleFanProvokingVertexSelect;
+#define     AALINEDISTANCE_TRUE                                1
+   uint32_t                                     AALineDistanceMode;
+   bool                                         SmoothPointEnable;
+   uint32_t                                     VertexSubPixelPrecisionSelect;
+#define     Vertex                                             0
+#define     State                                              1
+   uint32_t                                     PointWidthSource;
+   float                                        PointWidth;
+};
+
+static inline void
+GEN9_3DSTATE_SF_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_SF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: LineWidth converted to fixed point with 7 fractional bits */
+   dw[1] =
+      __gen_field(values->LineWidth * (1 << 7), 12, 29) |
+      __gen_field(values->LegacyGlobalDepthBiasEnable, 11, 11) |
+      __gen_field(values->StatisticsEnable, 10, 10) |
+      __gen_field(values->ViewportTransformEnable, 1, 1) |
+      0;
+
+   dw[2] =
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 16, 17) |
+      0;
+
+   /* DW3: PointWidth converted to fixed point with 3 fractional bits */
+   dw[3] =
+      __gen_field(values->LastPixelEnable, 31, 31) |
+      __gen_field(values->TriangleStripListProvokingVertexSelect, 29, 30) |
+      __gen_field(values->LineStripListProvokingVertexSelect, 27, 28) |
+      __gen_field(values->TriangleFanProvokingVertexSelect, 25, 26) |
+      __gen_field(values->AALineDistanceMode, 14, 14) |
+      __gen_field(values->SmoothPointEnable, 13, 13) |
+      __gen_field(values->VertexSubPixelPrecisionSelect, 12, 12) |
+      __gen_field(values->PointWidthSource, 11, 11) |
+      __gen_field(values->PointWidth * (1 << 3), 0, 10) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SO_BUFFER_length_bias 0x00000002
+#define GEN9_3DSTATE_SO_BUFFER_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 24,                  \
+   .DwordLength          =  6
+
+#define GEN9_3DSTATE_SO_BUFFER_length 0x00000008
+
+/* 3DSTATE_SO_BUFFER: binds one stream-output (transform feedback)
+ * buffer.  Both addresses are 64-bit and span two dwords each. */
+struct GEN9_3DSTATE_SO_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         SOBufferEnable;
+   uint32_t                                     SOBufferIndex;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      SOBufferObjectControlState;
+   bool                                         StreamOffsetWriteEnable;
+   bool                                         StreamOutputBufferOffsetAddressEnable;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceSize;
+   __gen_address_type                           StreamOutputBufferOffsetAddress;
+   uint32_t                                     StreamOffset;
+};
+
+static inline void
+GEN9_3DSTATE_SO_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_3DSTATE_SO_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* DW1: MOCS is packed first, then merged into bits 22-28 */
+   uint32_t dw_SOBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_SOBufferObjectControlState, &values->SOBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->SOBufferEnable, 31, 31) |
+      __gen_field(values->SOBufferIndex, 29, 30) |
+      __gen_field(dw_SOBufferObjectControlState, 22, 28) |
+      __gen_field(values->StreamOffsetWriteEnable, 21, 21) |
+      __gen_field(values->StreamOutputBufferOffsetAddressEnable, 20, 20) |
+      0;
+
+   /* DW2-DW3: 64-bit SurfaceBaseAddress (relocation via combine_address) */
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->SurfaceSize, 0, 29) |
+      0;
+
+   /* DW5-DW6: 64-bit StreamOutputBufferOffsetAddress */
+   uint32_t dw5 =
+      0;
+
+   uint64_t qw5 =
+      __gen_combine_address(data, &dw[5], values->StreamOutputBufferOffsetAddress, dw5);
+
+   dw[5] = qw5;
+   dw[6] = qw5 >> 32;
+
+   dw[7] =
+      __gen_field(values->StreamOffset, 0, 31) |
+      0;
+
+}
+
+#define GEN9_3DSTATE_SO_DECL_LIST_length_bias 0x00000002
+#define GEN9_3DSTATE_SO_DECL_LIST_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 23
+
+/* Variable-length command: no fixed DwordLength in the header above,
+ * and _length is 0 (length is computed by the caller). */
+#define GEN9_3DSTATE_SO_DECL_LIST_length 0x00000000
+
+#define GEN9_SO_DECL_ENTRY_length 0x00000002
+
+#define GEN9_SO_DECL_length 0x00000001
+
+/* One 16-bit stream-output declaration: which output buffer slot,
+ * which register, and which components to write. */
+struct GEN9_SO_DECL {
+   uint32_t                                     OutputBufferSlot;
+   uint32_t                                     HoleFlag;
+   uint32_t                                     RegisterIndex;
+   uint32_t                                     ComponentMask;
+};
+
+/* Pack one SO_DECL into the low 16 bits of a dword. */
+static inline void
+GEN9_SO_DECL_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN9_SO_DECL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->OutputBufferSlot, 12, 13) |
+      __gen_field(values->HoleFlag, 11, 11) |
+      __gen_field(values->RegisterIndex, 4, 9) |
+      __gen_field(values->ComponentMask, 0, 3) |
+      0;
+
+}
+
+/* One SO_DECL_ENTRY: four 16-bit SO_DECLs (one per stream) packed into
+ * a single 64-bit quadword, stream 0 in the lowest bits. */
+struct GEN9_SO_DECL_ENTRY {
+   struct GEN9_SO_DECL                          Stream3Decl;
+   struct GEN9_SO_DECL                          Stream2Decl;
+   struct GEN9_SO_DECL                          Stream1Decl;
+   struct GEN9_SO_DECL                          Stream0Decl;
+};
+
+static inline void
+GEN9_SO_DECL_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_SO_DECL_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* Pack each stream's declaration, then merge into one 64-bit value */
+   uint32_t dw_Stream3Decl;
+   GEN9_SO_DECL_pack(data, &dw_Stream3Decl, &values->Stream3Decl);
+   uint32_t dw_Stream2Decl;
+   GEN9_SO_DECL_pack(data, &dw_Stream2Decl, &values->Stream2Decl);
+   uint32_t dw_Stream1Decl;
+   GEN9_SO_DECL_pack(data, &dw_Stream1Decl, &values->Stream1Decl);
+   uint32_t dw_Stream0Decl;
+   GEN9_SO_DECL_pack(data, &dw_Stream0Decl, &values->Stream0Decl);
+   uint64_t qw0 =
+      __gen_field(dw_Stream3Decl, 48, 63) |
+      __gen_field(dw_Stream2Decl, 32, 47) |
+      __gen_field(dw_Stream1Decl, 16, 31) |
+      __gen_field(dw_Stream0Decl, 0, 15) |
+      0;
+
+   /* Split the quadword across two consecutive dwords, low half first */
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+}
+
+/* 3DSTATE_SO_DECL_LIST: per-stream buffer selects and entry counts,
+ * followed by a caller-emitted variable-length list of SO_DECL_ENTRYs. */
+struct GEN9_3DSTATE_SO_DECL_LIST {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StreamtoBufferSelects3;
+   uint32_t                                     StreamtoBufferSelects2;
+   uint32_t                                     StreamtoBufferSelects1;
+   uint32_t                                     StreamtoBufferSelects0;
+   uint32_t                                     NumEntries3;
+   uint32_t                                     NumEntries2;
+   uint32_t                                     NumEntries1;
+   uint32_t                                     NumEntries0;
+   /* variable length fields follow */
+};
+
+static inline void
+GEN9_3DSTATE_SO_DECL_LIST_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_SO_DECL_LIST * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* DW0: command header.  Unlike the other commands here, DwordLength
+    * occupies 9 bits (0-8) to allow a longer variable-length payload. */
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 8) |
+      0;
+
+   /* DW1: 4-bit stream-to-buffer select mask per stream */
+   dw[1] =
+      __gen_field(values->StreamtoBufferSelects3, 12, 15) |
+      __gen_field(values->StreamtoBufferSelects2, 8, 11) |
+      __gen_field(values->StreamtoBufferSelects1, 4, 7) |
+      __gen_field(values->StreamtoBufferSelects0, 0, 3) |
+      0;
+
+   /* DW2: 8-bit entry count per stream */
+   dw[2] =
+      __gen_field(values->NumEntries3, 24, 31) |
+      __gen_field(values->NumEntries2, 16, 23) |
+      __gen_field(values->NumEntries1, 8, 15) |
+      __gen_field(values->NumEntries0, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN9_3DSTATE_STENCIL_BUFFER: fixed 5-dword command binding the stencil
+ * buffer (enable bit, MOCS, pitch, 48-bit base address, QPitch). */
+#define GEN9_3DSTATE_STENCIL_BUFFER_length_bias 0x00000002
+#define GEN9_3DSTATE_STENCIL_BUFFER_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  6,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_STENCIL_BUFFER_length 0x00000005
+
+struct GEN9_3DSTATE_STENCIL_BUFFER {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StencilBufferEnable;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      StencilBufferObjectControlState;
+   uint32_t                                     SurfacePitch;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     SurfaceQPitch;
+};
+
+/* Packs 3DSTATE_STENCIL_BUFFER into dst; the base address is combined via
+ * __gen_combine_address so a relocation can be emitted for dw[2..3]. */
+static inline void
+GEN9_3DSTATE_STENCIL_BUFFER_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN9_3DSTATE_STENCIL_BUFFER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* MOCS sub-structure is packed first, then placed at bits 22..28 of dw1. */
+   uint32_t dw_StencilBufferObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_StencilBufferObjectControlState, &values->StencilBufferObjectControlState);
+   dw[1] =
+      __gen_field(values->StencilBufferEnable, 31, 31) |
+      __gen_field(dw_StencilBufferObjectControlState, 22, 28) |
+      __gen_field(values->SurfacePitch, 0, 16) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   /* 64-bit graphics address split across dw[2] (low) and dw[3] (high). */
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->SurfaceBaseAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   dw[4] =
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_STREAMOUT: fixed 5-dword command enabling/configuring the
+ * stream-output (transform feedback) stage. */
+#define GEN9_3DSTATE_STREAMOUT_length_bias 0x00000002
+#define GEN9_3DSTATE_STREAMOUT_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 30,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_STREAMOUT_length 0x00000005
+
+struct GEN9_3DSTATE_STREAMOUT {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SOFunctionEnable;
+   uint32_t                                     APIRenderingDisable;
+   uint32_t                                     RenderStreamSelect;
+/* Values for ReorderMode. */
+#define     LEADING                                            0
+#define     TRAILING                                           1
+   uint32_t                                     ReorderMode;
+   bool                                         SOStatisticsEnable;
+/* Values for ForceRendering.
+ * NOTE(review): "Resreved" is a typo for "Reserved" in the generated name;
+ * renaming would break users of this header — fix it in the generator. */
+#define     Normal                                             0
+#define     Resreved                                           1
+#define     Force_Off                                          2
+#define     Force_on                                           3
+   uint32_t                                     ForceRendering;
+   uint32_t                                     Stream3VertexReadOffset;
+   uint32_t                                     Stream3VertexReadLength;
+   uint32_t                                     Stream2VertexReadOffset;
+   uint32_t                                     Stream2VertexReadLength;
+   uint32_t                                     Stream1VertexReadOffset;
+   uint32_t                                     Stream1VertexReadLength;
+   uint32_t                                     Stream0VertexReadOffset;
+   uint32_t                                     Stream0VertexReadLength;
+   uint32_t                                     Buffer1SurfacePitch;
+   uint32_t                                     Buffer0SurfacePitch;
+   uint32_t                                     Buffer3SurfacePitch;
+   uint32_t                                     Buffer2SurfacePitch;
+};
+
+/* Packs 3DSTATE_STREAMOUT into dst (5 dwords, no relocations). */
+static inline void
+GEN9_3DSTATE_STREAMOUT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_3DSTATE_STREAMOUT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->SOFunctionEnable, 31, 31) |
+      __gen_field(values->APIRenderingDisable, 30, 30) |
+      __gen_field(values->RenderStreamSelect, 27, 28) |
+      __gen_field(values->ReorderMode, 26, 26) |
+      __gen_field(values->SOStatisticsEnable, 25, 25) |
+      __gen_field(values->ForceRendering, 23, 24) |
+      0;
+
+   /* dw2: per-stream vertex read offset (1 bit) + read length (5 bits). */
+   dw[2] =
+      __gen_field(values->Stream3VertexReadOffset, 29, 29) |
+      __gen_field(values->Stream3VertexReadLength, 24, 28) |
+      __gen_field(values->Stream2VertexReadOffset, 21, 21) |
+      __gen_field(values->Stream2VertexReadLength, 16, 20) |
+      __gen_field(values->Stream1VertexReadOffset, 13, 13) |
+      __gen_field(values->Stream1VertexReadLength, 8, 12) |
+      __gen_field(values->Stream0VertexReadOffset, 5, 5) |
+      __gen_field(values->Stream0VertexReadLength, 0, 4) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Buffer1SurfacePitch, 16, 27) |
+      __gen_field(values->Buffer0SurfacePitch, 0, 11) |
+      0;
+
+   dw[4] =
+      __gen_field(values->Buffer3SurfacePitch, 16, 27) |
+      __gen_field(values->Buffer2SurfacePitch, 0, 11) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_TE: fixed 4-dword command configuring the tessellation
+ * engine (partitioning, output topology, domain, max tess factors). */
+#define GEN9_3DSTATE_TE_length_bias 0x00000002
+#define GEN9_3DSTATE_TE_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 28,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_TE_length 0x00000004
+
+struct GEN9_3DSTATE_TE {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+/* Values for Partitioning. */
+#define     INTEGER                                            0
+#define     ODD_FRACTIONAL                                     1
+#define     EVEN_FRACTIONAL                                    2
+   uint32_t                                     Partitioning;
+/* Values for OutputTopology. */
+#define     POINT                                              0
+#define     OUTPUT_LINE                                        1
+#define     OUTPUT_TRI_CW                                      2
+#define     OUTPUT_TRI_CCW                                     3
+   uint32_t                                     OutputTopology;
+/* Values for TEDomain. */
+#define     QUAD                                               0
+#define     TRI                                                1
+#define     ISOLINE                                            2
+   uint32_t                                     TEDomain;
+/* Values for TEMode. */
+#define     HW_TESS                                            0
+   uint32_t                                     TEMode;
+   bool                                         TEEnable;
+   float                                        MaximumTessellationFactorOdd;
+   float                                        MaximumTessellationFactorNotOdd;
+};
+
+/* Packs 3DSTATE_TE into dst; the two tess factors are stored as raw
+ * IEEE-754 float bit patterns via __gen_float. */
+static inline void
+GEN9_3DSTATE_TE_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_TE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Partitioning, 12, 13) |
+      __gen_field(values->OutputTopology, 8, 9) |
+      __gen_field(values->TEDomain, 4, 5) |
+      __gen_field(values->TEMode, 1, 2) |
+      __gen_field(values->TEEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_float(values->MaximumTessellationFactorOdd) |
+      0;
+
+   dw[3] =
+      __gen_float(values->MaximumTessellationFactorNotOdd) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_URB_CLEAR: fixed 2-dword command clearing a URB range
+ * (length + offset; offset packed with __gen_offset, not __gen_field). */
+#define GEN9_3DSTATE_URB_CLEAR_length_bias 0x00000002
+#define GEN9_3DSTATE_URB_CLEAR_header           \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  1,                  \
+   ._3DCommandSubOpcode  = 29,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_URB_CLEAR_length 0x00000002
+
+struct GEN9_3DSTATE_URB_CLEAR {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBClearLength;
+   uint32_t                                     URBAddress;
+};
+
+/* Packs 3DSTATE_URB_CLEAR into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_URB_CLEAR_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_3DSTATE_URB_CLEAR * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBClearLength, 16, 29) |
+      __gen_offset(values->URBAddress, 0, 14) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_URB_DS: fixed 2-dword URB allocation for the domain shader
+ * (start address, entry size, entry count). Same layout as URB_{HS,GS}. */
+#define GEN9_3DSTATE_URB_DS_length_bias 0x00000002
+#define GEN9_3DSTATE_URB_DS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 50,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_URB_DS_length 0x00000002
+
+struct GEN9_3DSTATE_URB_DS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     DSURBStartingAddress;
+   uint32_t                                     DSURBEntryAllocationSize;
+   uint32_t                                     DSNumberofURBEntries;
+};
+
+/* Packs 3DSTATE_URB_DS into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_URB_DS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_3DSTATE_URB_DS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->DSURBStartingAddress, 25, 31) |
+      __gen_field(values->DSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->DSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_URB_GS: fixed 2-dword URB allocation for the geometry shader;
+ * identical bit layout to URB_{DS,HS}, sub-opcode 51. */
+#define GEN9_3DSTATE_URB_GS_length_bias 0x00000002
+#define GEN9_3DSTATE_URB_GS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 51,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_URB_GS_length 0x00000002
+
+struct GEN9_3DSTATE_URB_GS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     GSURBStartingAddress;
+   uint32_t                                     GSURBEntryAllocationSize;
+   uint32_t                                     GSNumberofURBEntries;
+};
+
+/* Packs 3DSTATE_URB_GS into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_URB_GS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_3DSTATE_URB_GS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->GSURBStartingAddress, 25, 31) |
+      __gen_field(values->GSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->GSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_URB_HS: fixed 2-dword URB allocation for the hull shader;
+ * identical bit layout to URB_{DS,GS}, sub-opcode 49. */
+#define GEN9_3DSTATE_URB_HS_length_bias 0x00000002
+#define GEN9_3DSTATE_URB_HS_header              \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 49,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_URB_HS_length 0x00000002
+
+struct GEN9_3DSTATE_URB_HS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     HSURBStartingAddress;
+   uint32_t                                     HSURBEntryAllocationSize;
+   uint32_t                                     HSNumberofURBEntries;
+};
+
+/* Packs 3DSTATE_URB_HS into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_URB_HS_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_3DSTATE_URB_HS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->HSURBStartingAddress, 25, 31) |
+      __gen_field(values->HSURBEntryAllocationSize, 16, 24) |
+      __gen_field(values->HSNumberofURBEntries, 0, 15) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VERTEX_BUFFERS header macros: variable-length command
+ * (length 0 here means "no fixed length"; the header macro deliberately
+ * omits .DwordLength — callers set it from the number of buffer states). */
+#define GEN9_3DSTATE_VERTEX_BUFFERS_length_bias 0x00000002
+#define GEN9_3DSTATE_VERTEX_BUFFERS_header      \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  8
+
+#define GEN9_3DSTATE_VERTEX_BUFFERS_length 0x00000000
+
+#define GEN9_VERTEX_BUFFER_STATE_length 0x00000004
+
+/* One 4-dword vertex-buffer entry appended after the 3DSTATE_VERTEX_BUFFERS
+ * header (index, MOCS, pitch, 48-bit start address, size). */
+struct GEN9_VERTEX_BUFFER_STATE {
+   uint32_t                                     VertexBufferIndex;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   uint32_t                                     AddressModifyEnable;
+   bool                                         NullVertexBuffer;
+   uint32_t                                     BufferPitch;
+   __gen_address_type                           BufferStartingAddress;
+   uint32_t                                     BufferSize;
+};
+
+/* Packs one VERTEX_BUFFER_STATE entry; the start address is combined via
+ * __gen_combine_address so a relocation can be emitted for dw[1..2]. */
+static inline void
+GEN9_VERTEX_BUFFER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_VERTEX_BUFFER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   /* MOCS sub-structure is packed first, then placed at bits 16..22 of dw0. */
+   uint32_t dw_MemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(dw_MemoryObjectControlState, 16, 22) |
+      __gen_field(values->AddressModifyEnable, 14, 14) |
+      __gen_field(values->NullVertexBuffer, 13, 13) |
+      __gen_field(values->BufferPitch, 0, 11) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   /* 64-bit graphics address split across dw[1] (low) and dw[2] (high). */
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->BufferStartingAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->BufferSize, 0, 31) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VERTEX_BUFFERS: header-only struct; the VERTEX_BUFFER_STATE
+ * entries follow as variable-length payload packed by the caller. */
+struct GEN9_3DSTATE_VERTEX_BUFFERS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Packs only the single header dword of 3DSTATE_VERTEX_BUFFERS. */
+static inline void
+GEN9_3DSTATE_VERTEX_BUFFERS_pack(__gen_user_data *data, void * restrict dst,
+                                 const struct GEN9_3DSTATE_VERTEX_BUFFERS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN9_3DSTATE_VERTEX_ELEMENTS header macros: variable-length command;
+ * the header macro omits .DwordLength (set by the caller per element count). */
+#define GEN9_3DSTATE_VERTEX_ELEMENTS_length_bias 0x00000002
+#define GEN9_3DSTATE_VERTEX_ELEMENTS_header     \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  =  9
+
+#define GEN9_3DSTATE_VERTEX_ELEMENTS_length 0x00000000
+
+#define GEN9_VERTEX_ELEMENT_STATE_length 0x00000002
+
+/* One 2-dword vertex-element descriptor appended after the
+ * 3DSTATE_VERTEX_ELEMENTS header (format, offset, component controls). */
+struct GEN9_VERTEX_ELEMENT_STATE {
+   uint32_t                                     VertexBufferIndex;
+   bool                                         Valid;
+   uint32_t                                     SourceElementFormat;
+   bool                                         EdgeFlagEnable;
+   uint32_t                                     SourceElementOffset;
+   uint32_t                                     Component0Control;
+   uint32_t                                     Component1Control;
+   uint32_t                                     Component2Control;
+   uint32_t                                     Component3Control;
+};
+
+/* Packs one VERTEX_ELEMENT_STATE entry (2 dwords, no relocations). */
+static inline void
+GEN9_VERTEX_ELEMENT_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_VERTEX_ELEMENT_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->VertexBufferIndex, 26, 31) |
+      __gen_field(values->Valid, 25, 25) |
+      __gen_field(values->SourceElementFormat, 16, 24) |
+      __gen_field(values->EdgeFlagEnable, 15, 15) |
+      __gen_field(values->SourceElementOffset, 0, 11) |
+      0;
+
+   /* dw1: 3-bit component control per channel (component 0 highest). */
+   dw[1] =
+      __gen_field(values->Component0Control, 28, 30) |
+      __gen_field(values->Component1Control, 24, 26) |
+      __gen_field(values->Component2Control, 20, 22) |
+      __gen_field(values->Component3Control, 16, 18) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VERTEX_ELEMENTS: header-only struct; the VERTEX_ELEMENT_STATE
+ * entries follow as variable-length payload packed by the caller. */
+struct GEN9_3DSTATE_VERTEX_ELEMENTS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   /* variable length fields follow */
+};
+
+/* Packs only the single header dword of 3DSTATE_VERTEX_ELEMENTS. */
+static inline void
+GEN9_3DSTATE_VERTEX_ELEMENTS_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN9_3DSTATE_VERTEX_ELEMENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+/* GEN9_3DSTATE_VF: fixed 2-dword vertex-fetch command; cut-index enables
+ * live in the header dword (bits 8..10), the cut index itself in dw1. */
+#define GEN9_3DSTATE_VF_length_bias 0x00000002
+#define GEN9_3DSTATE_VF_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 12,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_VF_length 0x00000002
+
+struct GEN9_3DSTATE_VF {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         SequentialDrawCutIndexEnable;
+   bool                                         ComponentPackingEnable;
+   bool                                         IndexedDrawCutIndexEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CutIndex;
+};
+
+/* Packs 3DSTATE_VF into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_VF_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_VF * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->SequentialDrawCutIndexEnable, 10, 10) |
+      __gen_field(values->ComponentPackingEnable, 9, 9) |
+      __gen_field(values->IndexedDrawCutIndexEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->CutIndex, 0, 31) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VF_COMPONENT_PACKING: fixed 5-dword command carrying a 4-bit
+ * component-enable mask for each of 32 vertex elements, 8 elements per dword
+ * (element 0 in the low nibble of dw1, element 31 in the high nibble of dw4). */
+#define GEN9_3DSTATE_VF_COMPONENT_PACKING_length_bias 0x00000002
+#define GEN9_3DSTATE_VF_COMPONENT_PACKING_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 85,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_VF_COMPONENT_PACKING_length 0x00000005
+
+struct GEN9_3DSTATE_VF_COMPONENT_PACKING {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     VertexElement07Enables;
+   uint32_t                                     VertexElement06Enables;
+   uint32_t                                     VertexElement05Enables;
+   uint32_t                                     VertexElement04Enables;
+   uint32_t                                     VertexElement03Enables;
+   uint32_t                                     VertexElement02Enables;
+   uint32_t                                     VertexElement01Enables;
+   uint32_t                                     VertexElement00Enables;
+   uint32_t                                     VertexElement15Enables;
+   uint32_t                                     VertexElement14Enables;
+   uint32_t                                     VertexElement13Enables;
+   uint32_t                                     VertexElement12Enables;
+   uint32_t                                     VertexElement11Enables;
+   uint32_t                                     VertexElement10Enables;
+   uint32_t                                     VertexElement09Enables;
+   uint32_t                                     VertexElement08Enables;
+   uint32_t                                     VertexElement23Enables;
+   uint32_t                                     VertexElement22Enables;
+   uint32_t                                     VertexElement21Enables;
+   uint32_t                                     VertexElement20Enables;
+   uint32_t                                     VertexElement19Enables;
+   uint32_t                                     VertexElement18Enables;
+   uint32_t                                     VertexElement17Enables;
+   uint32_t                                     VertexElement16Enables;
+   uint32_t                                     VertexElement31Enables;
+   uint32_t                                     VertexElement30Enables;
+   uint32_t                                     VertexElement29Enables;
+   uint32_t                                     VertexElement28Enables;
+   uint32_t                                     VertexElement27Enables;
+   uint32_t                                     VertexElement26Enables;
+   uint32_t                                     VertexElement25Enables;
+   uint32_t                                     VertexElement24Enables;
+};
+
+/* Packs 3DSTATE_VF_COMPONENT_PACKING into dst (5 dwords). */
+static inline void
+GEN9_3DSTATE_VF_COMPONENT_PACKING_pack(__gen_user_data *data, void * restrict dst,
+                                       const struct GEN9_3DSTATE_VF_COMPONENT_PACKING * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Elements 0..7. */
+   dw[1] =
+      __gen_field(values->VertexElement07Enables, 28, 31) |
+      __gen_field(values->VertexElement06Enables, 24, 27) |
+      __gen_field(values->VertexElement05Enables, 20, 23) |
+      __gen_field(values->VertexElement04Enables, 16, 19) |
+      __gen_field(values->VertexElement03Enables, 12, 15) |
+      __gen_field(values->VertexElement02Enables, 8, 11) |
+      __gen_field(values->VertexElement01Enables, 4, 7) |
+      __gen_field(values->VertexElement00Enables, 0, 3) |
+      0;
+
+   /* Elements 8..15. */
+   dw[2] =
+      __gen_field(values->VertexElement15Enables, 28, 31) |
+      __gen_field(values->VertexElement14Enables, 24, 27) |
+      __gen_field(values->VertexElement13Enables, 20, 23) |
+      __gen_field(values->VertexElement12Enables, 16, 19) |
+      __gen_field(values->VertexElement11Enables, 12, 15) |
+      __gen_field(values->VertexElement10Enables, 8, 11) |
+      __gen_field(values->VertexElement09Enables, 4, 7) |
+      __gen_field(values->VertexElement08Enables, 0, 3) |
+      0;
+
+   /* Elements 16..23. */
+   dw[3] =
+      __gen_field(values->VertexElement23Enables, 28, 31) |
+      __gen_field(values->VertexElement22Enables, 24, 27) |
+      __gen_field(values->VertexElement21Enables, 20, 23) |
+      __gen_field(values->VertexElement20Enables, 16, 19) |
+      __gen_field(values->VertexElement19Enables, 12, 15) |
+      __gen_field(values->VertexElement18Enables, 8, 11) |
+      __gen_field(values->VertexElement17Enables, 4, 7) |
+      __gen_field(values->VertexElement16Enables, 0, 3) |
+      0;
+
+   /* Elements 24..31. */
+   dw[4] =
+      __gen_field(values->VertexElement31Enables, 28, 31) |
+      __gen_field(values->VertexElement30Enables, 24, 27) |
+      __gen_field(values->VertexElement29Enables, 20, 23) |
+      __gen_field(values->VertexElement28Enables, 16, 19) |
+      __gen_field(values->VertexElement27Enables, 12, 15) |
+      __gen_field(values->VertexElement26Enables, 8, 11) |
+      __gen_field(values->VertexElement25Enables, 4, 7) |
+      __gen_field(values->VertexElement24Enables, 0, 3) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VF_INSTANCING: fixed 3-dword command setting per-element
+ * instancing (enable, element index, instance data step rate). */
+#define GEN9_3DSTATE_VF_INSTANCING_length_bias 0x00000002
+#define GEN9_3DSTATE_VF_INSTANCING_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 73,                  \
+   .DwordLength          =  1
+
+#define GEN9_3DSTATE_VF_INSTANCING_length 0x00000003
+
+struct GEN9_3DSTATE_VF_INSTANCING {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         InstancingEnable;
+   uint32_t                                     VertexElementIndex;
+   uint32_t                                     InstanceDataStepRate;
+};
+
+/* Packs 3DSTATE_VF_INSTANCING into dst (3 dwords). */
+static inline void
+GEN9_3DSTATE_VF_INSTANCING_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN9_3DSTATE_VF_INSTANCING * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InstancingEnable, 8, 8) |
+      __gen_field(values->VertexElementIndex, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->InstanceDataStepRate, 0, 31) |
+      0;
+
+}
+
+/* GEN9_3DSTATE_VF_SGVS: fixed 2-dword command configuring system-generated
+ * values (InstanceID / VertexID component and element placement). */
+#define GEN9_3DSTATE_VF_SGVS_length_bias 0x00000002
+#define GEN9_3DSTATE_VF_SGVS_header             \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 74,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_VF_SGVS_length 0x00000002
+
+struct GEN9_3DSTATE_VF_SGVS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         InstanceIDEnable;
+/* Values for InstanceIDComponentNumber. */
+#define     COMP_0                                             0
+#define     COMP_1                                             1
+#define     COMP_2                                             2
+#define     COMP_3                                             3
+   uint32_t                                     InstanceIDComponentNumber;
+   uint32_t                                     InstanceIDElementOffset;
+   bool                                         VertexIDEnable;
+/* Values for VertexIDComponentNumber.
+ * NOTE(review): COMP_0..COMP_3 are redefined here with identical replacement
+ * lists — benign under C (identical redefinition is permitted), but a
+ * generator artifact worth deduplicating upstream. */
+#define     COMP_0                                             0
+#define     COMP_1                                             1
+#define     COMP_2                                             2
+#define     COMP_3                                             3
+   uint32_t                                     VertexIDComponentNumber;
+   uint32_t                                     VertexIDElementOffset;
+};
+
+/* Packs 3DSTATE_VF_SGVS into dst (2 dwords). */
+static inline void
+GEN9_3DSTATE_VF_SGVS_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN9_3DSTATE_VF_SGVS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InstanceIDEnable, 31, 31) |
+      __gen_field(values->InstanceIDComponentNumber, 29, 30) |
+      __gen_field(values->InstanceIDElementOffset, 16, 21) |
+      __gen_field(values->VertexIDEnable, 15, 15) |
+      __gen_field(values->VertexIDComponentNumber, 13, 14) |
+      __gen_field(values->VertexIDElementOffset, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_VF_STATISTICS (sub-opcode 11), a single dword: it has no
+ * DwordLength field, hence the length bias of 1. */
+#define GEN9_3DSTATE_VF_STATISTICS_length_bias 0x00000001
+#define GEN9_3DSTATE_VF_STATISTICS_header       \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  1,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 11
+
+#define GEN9_3DSTATE_VF_STATISTICS_length 0x00000001
+
+struct GEN9_3DSTATE_VF_STATISTICS {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   bool                                         StatisticsEnable;
+};
+
+/* Emit 3DSTATE_VF_STATISTICS: packs `values` into one dword at `dst`.
+ * `data` is unused. */
+static inline void
+GEN9_3DSTATE_VF_STATISTICS_pack(__gen_user_data *data, void * restrict dst,
+                                const struct GEN9_3DSTATE_VF_STATISTICS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->StatisticsEnable, 0, 0) |
+      0;
+
+}
+
+/* 3DSTATE_VF_TOPOLOGY (sub-opcode 75), 2 dwords: selects the primitive
+ * topology for the vertex fetch stage. */
+#define GEN9_3DSTATE_VF_TOPOLOGY_length_bias 0x00000002
+#define GEN9_3DSTATE_VF_TOPOLOGY_header         \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 75,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_VF_TOPOLOGY_length 0x00000002
+
+struct GEN9_3DSTATE_VF_TOPOLOGY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     PrimitiveTopologyType;
+};
+
+/* Emit 3DSTATE_VF_TOPOLOGY: packs `values` into two dwords at `dst`.
+ * `data` is unused. */
+static inline void
+GEN9_3DSTATE_VF_TOPOLOGY_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_3DSTATE_VF_TOPOLOGY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->PrimitiveTopologyType, 0, 5) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_CC (sub-opcode 35), 2 dwords: points the
+ * hardware at the CC viewport state.  The pointer is packed with __gen_offset
+ * into bits 5..31, i.e. it must be 32-byte aligned. */
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length_bias 0x00000002
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 35,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC_length 0x00000002
+
+struct GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CCViewportPointer;
+};
+
+/* Emit 3DSTATE_VIEWPORT_STATE_POINTERS_CC into two dwords at `dst`.
+ * `data` is unused. */
+static inline void
+GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->CCViewportPointer, 5, 31) |
+      0;
+
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP (sub-opcode 33), 2 dwords: points
+ * the hardware at the SF/CLIP viewport state.  The pointer occupies bits
+ * 6..31, i.e. it must be 64-byte aligned. */
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length_bias 0x00000002
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_header\
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 33,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_length 0x00000002
+
+struct GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     SFClipViewportPointer;
+};
+
+/* Emit 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP into two dwords at `dst`.
+ * `data` is unused. */
+static inline void
+GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_pack(__gen_user_data *data, void * restrict dst,
+                                                  const struct GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->SFClipViewportPointer, 6, 31) |
+      0;
+
+}
+
+/* 3DSTATE_WM (sub-opcode 20), 2 dwords: windower/pixel-pipe control state
+ * (depth/stencil legacy ops, interpolation modes, stipple, AA widths). */
+#define GEN9_3DSTATE_WM_length_bias 0x00000002
+#define GEN9_3DSTATE_WM_header                  \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 20,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_WM_length 0x00000002
+
+struct GEN9_3DSTATE_WM {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StatisticsEnable;
+   bool                                         LegacyDepthBufferClearEnable;
+   bool                                         LegacyDepthBufferResolveEnable;
+   bool                                         LegacyHierarchicalDepthBufferResolveEnable;
+   bool                                         LegacyDiamondLineRasterization;
+#define     NORMAL                                             0
+#define     PSEXEC                                             1
+#define     PREPS                                              2
+   uint32_t                                     EarlyDepthStencilControl;
+#define     Normal                                             0
+#define     ForceOff                                           1
+#define     ForceON                                            2
+   uint32_t                                     ForceThreadDispatchEnable;
+#define     INTERP_PIXEL                                       0
+#define     INTERP_CENTROID                                    2
+#define     INTERP_SAMPLE                                      3
+   uint32_t                                     PositionZWInterpolationMode;
+   uint32_t                                     BarycentricInterpolationMode;
+   /* The _NNpixels and Normal/ForceOff/ForceON macros are re-emitted per
+    * field by the generator; identical redefinitions are benign in C. */
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineEndCapAntialiasingRegionWidth;
+#define     _05pixels                                          0
+#define     _10pixels                                          1
+#define     _20pixels                                          2
+#define     _40pixels                                          3
+   uint32_t                                     LineAntialiasingRegionWidth;
+   bool                                         PolygonStippleEnable;
+   bool                                         LineStippleEnable;
+#define     RASTRULE_UPPER_LEFT                                0
+#define     RASTRULE_UPPER_RIGHT                               1
+   uint32_t                                     PointRasterizationRule;
+#define     Normal                                             0
+#define     ForceOff                                           1
+#define     ForceON                                            2
+   uint32_t                                     ForceKillPixelEnable;
+};
+
+/* Emit 3DSTATE_WM: packs `values` into two dwords at `dst`.  `data` is
+ * unused. */
+static inline void
+GEN9_3DSTATE_WM_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_3DSTATE_WM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StatisticsEnable, 31, 31) |
+      __gen_field(values->LegacyDepthBufferClearEnable, 30, 30) |
+      __gen_field(values->LegacyDepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->LegacyHierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->LegacyDiamondLineRasterization, 26, 26) |
+      __gen_field(values->EarlyDepthStencilControl, 21, 22) |
+      __gen_field(values->ForceThreadDispatchEnable, 19, 20) |
+      __gen_field(values->PositionZWInterpolationMode, 17, 18) |
+      __gen_field(values->BarycentricInterpolationMode, 11, 16) |
+      __gen_field(values->LineEndCapAntialiasingRegionWidth, 8, 9) |
+      __gen_field(values->LineAntialiasingRegionWidth, 6, 7) |
+      __gen_field(values->PolygonStippleEnable, 4, 4) |
+      __gen_field(values->LineStippleEnable, 3, 3) |
+      __gen_field(values->PointRasterizationRule, 2, 2) |
+      __gen_field(values->ForceKillPixelEnable, 0, 1) |
+      0;
+
+}
+
+/* 3DSTATE_WM_CHROMAKEY (sub-opcode 76), 2 dwords: a single kill-enable bit
+ * in DW1 bit 31. */
+#define GEN9_3DSTATE_WM_CHROMAKEY_length_bias 0x00000002
+#define GEN9_3DSTATE_WM_CHROMAKEY_header        \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 76,                  \
+   .DwordLength          =  0
+
+#define GEN9_3DSTATE_WM_CHROMAKEY_length 0x00000002
+
+struct GEN9_3DSTATE_WM_CHROMAKEY {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         ChromaKeyKillEnable;
+};
+
+/* Emit 3DSTATE_WM_CHROMAKEY into two dwords at `dst`.  `data` is unused. */
+static inline void
+GEN9_3DSTATE_WM_CHROMAKEY_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_3DSTATE_WM_CHROMAKEY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ChromaKeyKillEnable, 31, 31) |
+      0;
+
+}
+
+/* 3DSTATE_WM_DEPTH_STENCIL (sub-opcode 78), 4 dwords: depth/stencil test
+ * functions and ops (DW1), test/write masks (DW2), reference values (DW3).
+ * DwordLength = 2 = total length (4) minus the length bias (2). */
+#define GEN9_3DSTATE_WM_DEPTH_STENCIL_length_bias 0x00000002
+#define GEN9_3DSTATE_WM_DEPTH_STENCIL_header    \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 78,                  \
+   .DwordLength          =  2
+
+#define GEN9_3DSTATE_WM_DEPTH_STENCIL_length 0x00000004
+
+struct GEN9_3DSTATE_WM_DEPTH_STENCIL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StencilFailOp;
+   uint32_t                                     StencilPassDepthFailOp;
+   uint32_t                                     StencilPassDepthPassOp;
+   uint32_t                                     BackfaceStencilTestFunction;
+   uint32_t                                     BackfaceStencilFailOp;
+   uint32_t                                     BackfaceStencilPassDepthFailOp;
+   uint32_t                                     BackfaceStencilPassDepthPassOp;
+   uint32_t                                     StencilTestFunction;
+   uint32_t                                     DepthTestFunction;
+   bool                                         DoubleSidedStencilEnable;
+   bool                                         StencilTestEnable;
+   bool                                         StencilBufferWriteEnable;
+   bool                                         DepthTestEnable;
+   bool                                         DepthBufferWriteEnable;
+   uint32_t                                     StencilTestMask;
+   uint32_t                                     StencilWriteMask;
+   uint32_t                                     BackfaceStencilTestMask;
+   uint32_t                                     BackfaceStencilWriteMask;
+   uint32_t                                     StencilReferenceValue;
+   uint32_t                                     BackfaceStencilReferenceValue;
+};
+
+/* Emit 3DSTATE_WM_DEPTH_STENCIL into four dwords at `dst`.  `data` is
+ * unused. */
+static inline void
+GEN9_3DSTATE_WM_DEPTH_STENCIL_pack(__gen_user_data *data, void * restrict dst,
+                                   const struct GEN9_3DSTATE_WM_DEPTH_STENCIL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StencilFailOp, 29, 31) |
+      __gen_field(values->StencilPassDepthFailOp, 26, 28) |
+      __gen_field(values->StencilPassDepthPassOp, 23, 25) |
+      __gen_field(values->BackfaceStencilTestFunction, 20, 22) |
+      __gen_field(values->BackfaceStencilFailOp, 17, 19) |
+      __gen_field(values->BackfaceStencilPassDepthFailOp, 14, 16) |
+      __gen_field(values->BackfaceStencilPassDepthPassOp, 11, 13) |
+      __gen_field(values->StencilTestFunction, 8, 10) |
+      __gen_field(values->DepthTestFunction, 5, 7) |
+      __gen_field(values->DoubleSidedStencilEnable, 4, 4) |
+      __gen_field(values->StencilTestEnable, 3, 3) |
+      __gen_field(values->StencilBufferWriteEnable, 2, 2) |
+      __gen_field(values->DepthTestEnable, 1, 1) |
+      __gen_field(values->DepthBufferWriteEnable, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->StencilTestMask, 24, 31) |
+      __gen_field(values->StencilWriteMask, 16, 23) |
+      __gen_field(values->BackfaceStencilTestMask, 8, 15) |
+      __gen_field(values->BackfaceStencilWriteMask, 0, 7) |
+      0;
+
+   dw[3] =
+      __gen_field(values->StencilReferenceValue, 8, 15) |
+      __gen_field(values->BackfaceStencilReferenceValue, 0, 7) |
+      0;
+
+}
+
+/* 3DSTATE_WM_HZ_OP (sub-opcode 82), 5 dwords: HiZ clear/resolve operation
+ * controls (DW1), clear rectangle min/max (DW2/DW3), sample mask (DW4). */
+#define GEN9_3DSTATE_WM_HZ_OP_length_bias 0x00000002
+#define GEN9_3DSTATE_WM_HZ_OP_header            \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  0,                  \
+   ._3DCommandSubOpcode  = 82,                  \
+   .DwordLength          =  3
+
+#define GEN9_3DSTATE_WM_HZ_OP_length 0x00000005
+
+struct GEN9_3DSTATE_WM_HZ_OP {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         StencilBufferClearEnable;
+   bool                                         DepthBufferClearEnable;
+   bool                                         ScissorRectangleEnable;
+   bool                                         DepthBufferResolveEnable;
+   bool                                         HierarchicalDepthBufferResolveEnable;
+   uint32_t                                     PixelPositionOffsetEnable;
+   bool                                         FullSurfaceDepthClear;
+   uint32_t                                     StencilClearValue;
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     ClearRectangleYMin;
+   uint32_t                                     ClearRectangleXMin;
+   uint32_t                                     ClearRectangleYMax;
+   uint32_t                                     ClearRectangleXMax;
+   uint32_t                                     SampleMask;
+};
+
+/* Emit 3DSTATE_WM_HZ_OP into five dwords at `dst`.  `data` is unused. */
+static inline void
+GEN9_3DSTATE_WM_HZ_OP_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_3DSTATE_WM_HZ_OP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StencilBufferClearEnable, 31, 31) |
+      __gen_field(values->DepthBufferClearEnable, 30, 30) |
+      __gen_field(values->ScissorRectangleEnable, 29, 29) |
+      __gen_field(values->DepthBufferResolveEnable, 28, 28) |
+      __gen_field(values->HierarchicalDepthBufferResolveEnable, 27, 27) |
+      __gen_field(values->PixelPositionOffsetEnable, 26, 26) |
+      __gen_field(values->FullSurfaceDepthClear, 25, 25) |
+      __gen_field(values->StencilClearValue, 16, 23) |
+      __gen_field(values->NumberofMultisamples, 13, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ClearRectangleYMin, 16, 31) |
+      __gen_field(values->ClearRectangleXMin, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->ClearRectangleYMax, 16, 31) |
+      __gen_field(values->ClearRectangleXMax, 0, 15) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SampleMask, 0, 15) |
+      0;
+
+}
+
+/* GPGPU_WALKER (media pipeline, sub-opcode 5), 15 dwords: dispatches a
+ * compute thread-group walk.  Note this command's header uses Pipeline/
+ * MediaCommandOpcode/SubOpcode rather than the 3D CommandSubType fields,
+ * and DwordLength here occupies only bits 0..7 with extra enable bits
+ * (IndirectParameterEnable bit 10, PredicateEnable bit 8) in DW0. */
+#define GEN9_GPGPU_WALKER_length_bias 0x00000002
+#define GEN9_GPGPU_WALKER_header                \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  5,                  \
+   .DwordLength          = 13
+
+#define GEN9_GPGPU_WALKER_length 0x0000000f
+
+struct GEN9_GPGPU_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   bool                                         IndirectParameterEnable;
+   bool                                         PredicateEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+#define     SIMD8                                              0
+#define     SIMD16                                             1
+#define     SIMD32                                             2
+   uint32_t                                     SIMDSize;
+   uint32_t                                     ThreadDepthCounterMaximum;
+   uint32_t                                     ThreadHeightCounterMaximum;
+   uint32_t                                     ThreadWidthCounterMaximum;
+   uint32_t                                     ThreadGroupIDStartingX;
+   uint32_t                                     ThreadGroupIDXDimension;
+   uint32_t                                     ThreadGroupIDStartingY;
+   uint32_t                                     ThreadGroupIDYDimension;
+   uint32_t                                     ThreadGroupIDStartingResumeZ;
+   uint32_t                                     ThreadGroupIDZDimension;
+   uint32_t                                     RightExecutionMask;
+   uint32_t                                     BottomExecutionMask;
+};
+
+/* Emit GPGPU_WALKER into fifteen dwords at `dst`.  DW6, DW9 are reserved
+ * and written as zero.  `data` is unused. */
+static inline void
+GEN9_GPGPU_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_GPGPU_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->IndirectParameterEnable, 10, 10) |
+      __gen_field(values->PredicateEnable, 8, 8) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   /* 64-byte-aligned offset: __gen_offset keeps bits 6..31 only. */
+   dw[3] =
+      __gen_offset(values->IndirectDataStartAddress, 6, 31) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SIMDSize, 30, 31) |
+      __gen_field(values->ThreadDepthCounterMaximum, 16, 21) |
+      __gen_field(values->ThreadHeightCounterMaximum, 8, 13) |
+      __gen_field(values->ThreadWidthCounterMaximum, 0, 5) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ThreadGroupIDStartingX, 0, 31) |
+      0;
+
+   dw[6] =
+      0;
+
+   dw[7] =
+      __gen_field(values->ThreadGroupIDXDimension, 0, 31) |
+      0;
+
+   dw[8] =
+      __gen_field(values->ThreadGroupIDStartingY, 0, 31) |
+      0;
+
+   dw[9] =
+      0;
+
+   dw[10] =
+      __gen_field(values->ThreadGroupIDYDimension, 0, 31) |
+      0;
+
+   dw[11] =
+      __gen_field(values->ThreadGroupIDStartingResumeZ, 0, 31) |
+      0;
+
+   dw[12] =
+      __gen_field(values->ThreadGroupIDZDimension, 0, 31) |
+      0;
+
+   dw[13] =
+      __gen_field(values->RightExecutionMask, 0, 31) |
+      0;
+
+   dw[14] =
+      __gen_field(values->BottomExecutionMask, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_CURBE_LOAD (media pipeline, sub-opcode 1), 4 dwords: loads CURBE
+ * (constant) data.  DW1 is reserved/zero; note the media commands use a
+ * 16-bit DwordLength (bits 0..15) unlike the 8-bit one in 3D commands. */
+#define GEN9_MEDIA_CURBE_LOAD_length_bias 0x00000002
+#define GEN9_MEDIA_CURBE_LOAD_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  1,                  \
+   .DwordLength          =  2
+
+#define GEN9_MEDIA_CURBE_LOAD_length 0x00000004
+
+struct GEN9_MEDIA_CURBE_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     CURBETotalDataLength;
+   uint32_t                                     CURBEDataStartAddress;
+};
+
+/* Emit MEDIA_CURBE_LOAD into four dwords at `dst`.  `data` is unused. */
+static inline void
+GEN9_MEDIA_CURBE_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_MEDIA_CURBE_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->CURBETotalDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_field(values->CURBEDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_INTERFACE_DESCRIPTOR_LOAD (media pipeline, sub-opcode 2), 4 dwords:
+ * loads interface descriptors for compute dispatch.  DW1 is reserved/zero. */
+#define GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length_bias 0x00000002
+#define GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD_header\
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          =  2
+
+#define GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD_length 0x00000004
+
+struct GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorTotalLength;
+   uint32_t                                     InterfaceDescriptorDataStartAddress;
+};
+
+/* Emit MEDIA_INTERFACE_DESCRIPTOR_LOAD into four dwords at `dst`.
+ * `data` is unused. */
+static inline void
+GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack(__gen_user_data *data, void * restrict dst,
+                                          const struct GEN9_MEDIA_INTERFACE_DESCRIPTOR_LOAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      0;
+
+   dw[2] =
+      __gen_field(values->InterfaceDescriptorTotalLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->InterfaceDescriptorDataStartAddress, 0, 31) |
+      0;
+
+}
+
+/* MEDIA_OBJECT (media pipeline, sub-opcode 0): variable-length command
+ * (the 0 in *_length marks it as variable; inline data follows DW5), so
+ * callers must supply DwordLength themselves. */
+#define GEN9_MEDIA_OBJECT_length_bias 0x00000002
+#define GEN9_MEDIA_OBJECT_header                \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  0
+
+#define GEN9_MEDIA_OBJECT_length 0x00000000
+
+struct GEN9_MEDIA_OBJECT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+   uint32_t                                     SliceDestinationSelectMSBs;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+   uint32_t                                     ForceDestination;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+#define     Slice0                                             0
+#define     Slice1                                             1
+#define     Slice2                                             2
+   uint32_t                                     SliceDestinationSelect;
+#define     Subslice3                                          3
+#define     SubSlice2                                          2
+#define     SubSlice1                                          1
+#define     SubSlice0                                          0
+   uint32_t                                     SubSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   /* NOTE(review): "ScoredboardY" looks like a generator typo for
+    * ScoreboardY; renaming would break users of this header, so left as-is. */
+   uint32_t                                     ScoredboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   /* NOTE(review): packed into an 8-bit field (bits 0..7) below, yet
+    * declared bool, so only values 0/1 are expressible — presumably this
+    * should be uint32_t; verify against the genxml generator. */
+   bool                                         ScoreboardMask;
+   /* variable length fields follow */
+};
+
+/* Emit the fixed 6-dword prefix of MEDIA_OBJECT at `dst`.  Unlike the other
+ * pack functions here, `data` is used: DW3 is a relocated address combined
+ * via __gen_combine_address.  Inline payload after DW5 is the caller's
+ * responsibility. */
+static inline void
+GEN9_MEDIA_OBJECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_MEDIA_OBJECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ChildrenPresent, 31, 31) |
+      __gen_field(values->SliceDestinationSelectMSBs, 25, 26) |
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->ForceDestination, 22, 22) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->SliceDestinationSelect, 19, 20) |
+      __gen_field(values->SubSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoredboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MEDIA_OBJECT_GRPID_length_bias 0x00000002
+#define GEN9_MEDIA_OBJECT_GRPID_header          \
+   .CommandType          =  3,                  \
+   .MediaCommandPipeline =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .MediaCommandSubOpcode =  6
+
+/* Length 0: variable-length command, DwordLength supplied by the caller. */
+#define GEN9_MEDIA_OBJECT_GRPID_length 0x00000000
+
+/* MEDIA_OBJECT_GRPID command fields; packed by GEN9_MEDIA_OBJECT_GRPID_pack. */
+struct GEN9_MEDIA_OBJECT_GRPID {
+   uint32_t                                     CommandType;
+   uint32_t                                     MediaCommandPipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     MediaCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   uint32_t                                     SliceDestinationSelectMSB;
+   uint32_t                                     EndofThreadGroup;
+   uint32_t                                     ForceDestination;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+#define     Slice0                                             0
+#define     Slice1                                             1
+#define     Slice2                                             2
+   uint32_t                                     SliceDestinationSelect;
+#define     Subslice3                                          3
+#define     SubSlice2                                          2
+#define     SubSlice1                                          1
+#define     SubSlice0                                          0
+   uint32_t                                     SubSliceDestinationSelect;
+   uint32_t                                     IndirectDataLength;
+   __gen_address_type                           IndirectDataStartAddress;
+   uint32_t                                     ScoreboardY;
+   uint32_t                                     ScoreboardX;
+   uint32_t                                     ScoreboardColor;
+   /* 8-bit scoreboard dependency mask (packed into bits 0..7).  Was
+    * declared bool, which truncated any multi-bit mask to 1. */
+   uint32_t                                     ScoreboardMask;
+   uint32_t                                     GroupID;
+   /* variable length fields follow */
+};
+
+/* Emit a GEN9 MEDIA_OBJECT_GRPID command (7 dwords plus inline data)
+ * into 'dst', relocating the indirect-data address via 'data'. */
+static inline void
+GEN9_MEDIA_OBJECT_GRPID_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN9_MEDIA_OBJECT_GRPID * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MediaCommandPipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->MediaCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->SliceDestinationSelectMSB, 24, 24) |
+      __gen_field(values->EndofThreadGroup, 23, 23) |
+      __gen_field(values->ForceDestination, 22, 22) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->SliceDestinationSelect, 19, 20) |
+      __gen_field(values->SubSliceDestinationSelect, 17, 18) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->IndirectDataStartAddress, dw3);
+
+   dw[4] =
+      __gen_field(values->ScoreboardY, 16, 24) |
+      __gen_field(values->ScoreboardX, 0, 8) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ScoreboardColor, 16, 19) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->GroupID, 0, 31) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MEDIA_OBJECT_PRT_length_bias 0x00000002
+#define GEN9_MEDIA_OBJECT_PRT_header            \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  2,                  \
+   .DwordLength          = 14
+
+#define GEN9_MEDIA_OBJECT_PRT_length 0x00000010
+
+/* MEDIA_OBJECT_PRT command: fixed 16-dword layout with a 12-dword
+ * inline payload. */
+struct GEN9_MEDIA_OBJECT_PRT {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+   bool                                         ChildrenPresent;
+   bool                                         PRT_FenceNeeded;
+#define     Rootthreadqueue                                    0
+#define     VFEstateflush                                      1
+   uint32_t                                     PRT_FenceType;
+   uint32_t                                     InlineData[12];
+};
+
+/* Serialize a GEN9 MEDIA_OBJECT_PRT command (16 dwords) into 'dst'. */
+static inline void
+GEN9_MEDIA_OBJECT_PRT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_MEDIA_OBJECT_PRT * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* Dword 0: command header. */
+   out[0] = __gen_field(values->CommandType, 29, 31) |
+            __gen_field(values->Pipeline, 27, 28) |
+            __gen_field(values->MediaCommandOpcode, 24, 26) |
+            __gen_field(values->SubOpcode, 16, 23) |
+            __gen_field(values->DwordLength, 0, 15);
+
+   out[1] = __gen_field(values->InterfaceDescriptorOffset, 0, 5);
+
+   out[2] = __gen_field(values->ChildrenPresent, 31, 31) |
+            __gen_field(values->PRT_FenceNeeded, 23, 23) |
+            __gen_field(values->PRT_FenceType, 22, 22);
+
+   /* Dword 3 is reserved. */
+   out[3] = 0;
+
+   /* Dwords 4..15: inline payload, one dword per entry. */
+   for (uint32_t i = 0; i < 12; i++)
+      out[i + 4] = __gen_field(values->InlineData[i], 0, 31);
+}
+
+#define GEN9_MEDIA_OBJECT_WALKER_length_bias 0x00000002
+#define GEN9_MEDIA_OBJECT_WALKER_header         \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  1,                  \
+   .SubOpcode            =  3
+
+/* Length 0: variable-length command, DwordLength supplied by the caller. */
+#define GEN9_MEDIA_OBJECT_WALKER_length 0x00000000
+
+/* MEDIA_OBJECT_WALKER command fields; packed by
+ * GEN9_MEDIA_OBJECT_WALKER_pack. */
+struct GEN9_MEDIA_OBJECT_WALKER {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     InterfaceDescriptorOffset;
+#define     Nothreadsynchronization                            0
+#define     Threaddispatchissynchronizedbythespawnrootthreadmessage       1
+   uint32_t                                     ThreadSynchronization;
+   uint32_t                                     MaskedDispatch;
+#define     Notusingscoreboard                                 0
+#define     Usingscoreboard                                    1
+   uint32_t                                     UseScoreboard;
+   uint32_t                                     IndirectDataLength;
+   uint32_t                                     IndirectDataStartAddress;
+   uint32_t                                     GroupIDLoopSelect;
+   /* 8-bit scoreboard dependency mask (packed into bits 0..7).  Was
+    * declared bool, which truncated any multi-bit mask to 1. */
+   uint32_t                                     ScoreboardMask;
+   uint32_t                                     ColorCountMinusOne;
+   uint32_t                                     MiddleLoopExtraSteps;
+   uint32_t                                     LocalMidLoopUnitY;
+   uint32_t                                     MidLoopUnitX;
+   uint32_t                                     GlobalLoopExecCount;
+   uint32_t                                     LocalLoopExecCount;
+   uint32_t                                     BlockResolutionY;
+   uint32_t                                     BlockResolutionX;
+   uint32_t                                     LocalStartY;
+   uint32_t                                     LocalStartX;
+   uint32_t                                     LocalOuterLoopStrideY;
+   uint32_t                                     LocalOuterLoopStrideX;
+   uint32_t                                     LocalInnerLoopUnitY;
+   uint32_t                                     LocalInnerLoopUnitX;
+   uint32_t                                     GlobalResolutionY;
+   uint32_t                                     GlobalResolutionX;
+   uint32_t                                     GlobalStartY;
+   uint32_t                                     GlobalStartX;
+   uint32_t                                     GlobalOuterLoopStrideY;
+   uint32_t                                     GlobalOuterLoopStrideX;
+   uint32_t                                     GlobalInnerLoopUnitY;
+   uint32_t                                     GlobalInnerLoopUnitX;
+   /* variable length fields follow */
+};
+
+/* Emit a GEN9 MEDIA_OBJECT_WALKER command (17 dwords plus inline data)
+ * into 'dst'.  'data' is accepted for signature uniformity with the other
+ * pack helpers; this command contains no relocated addresses. */
+static inline void
+GEN9_MEDIA_OBJECT_WALKER_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_MEDIA_OBJECT_WALKER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->InterfaceDescriptorOffset, 0, 5) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ThreadSynchronization, 24, 24) |
+      __gen_field(values->MaskedDispatch, 22, 23) |
+      __gen_field(values->UseScoreboard, 21, 21) |
+      __gen_field(values->IndirectDataLength, 0, 16) |
+      0;
+
+   dw[3] =
+      __gen_field(values->IndirectDataStartAddress, 0, 31) |
+      0;
+
+   dw[4] =
+      0;
+
+   dw[5] =
+      __gen_field(values->GroupIDLoopSelect, 8, 31) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ColorCountMinusOne, 24, 27) |
+      __gen_field(values->MiddleLoopExtraSteps, 16, 20) |
+      __gen_field(values->LocalMidLoopUnitY, 12, 13) |
+      __gen_field(values->MidLoopUnitX, 8, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->GlobalLoopExecCount, 16, 27) |
+      __gen_field(values->LocalLoopExecCount, 0, 11) |
+      0;
+
+   dw[8] =
+      __gen_field(values->BlockResolutionY, 16, 26) |
+      __gen_field(values->BlockResolutionX, 0, 10) |
+      0;
+
+   dw[9] =
+      __gen_field(values->LocalStartY, 16, 26) |
+      __gen_field(values->LocalStartX, 0, 10) |
+      0;
+
+   dw[10] =
+      0;
+
+   dw[11] =
+      __gen_field(values->LocalOuterLoopStrideY, 16, 27) |
+      __gen_field(values->LocalOuterLoopStrideX, 0, 11) |
+      0;
+
+   dw[12] =
+      __gen_field(values->LocalInnerLoopUnitY, 16, 27) |
+      __gen_field(values->LocalInnerLoopUnitX, 0, 11) |
+      0;
+
+   dw[13] =
+      __gen_field(values->GlobalResolutionY, 16, 26) |
+      __gen_field(values->GlobalResolutionX, 0, 10) |
+      0;
+
+   dw[14] =
+      __gen_field(values->GlobalStartY, 16, 27) |
+      __gen_field(values->GlobalStartX, 0, 11) |
+      0;
+
+   dw[15] =
+      __gen_field(values->GlobalOuterLoopStrideY, 16, 27) |
+      __gen_field(values->GlobalOuterLoopStrideX, 0, 11) |
+      0;
+
+   dw[16] =
+      __gen_field(values->GlobalInnerLoopUnitY, 16, 27) |
+      __gen_field(values->GlobalInnerLoopUnitX, 0, 11) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MEDIA_STATE_FLUSH_length_bias 0x00000002
+#define GEN9_MEDIA_STATE_FLUSH_header           \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  4,                  \
+   .DwordLength          =  0
+
+#define GEN9_MEDIA_STATE_FLUSH_length 0x00000002
+
+/* MEDIA_STATE_FLUSH command: fixed 2-dword layout. */
+struct GEN9_MEDIA_STATE_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         FlushtoGO;
+   uint32_t                                     WatermarkRequired;
+   uint32_t                                     InterfaceDescriptorOffset;
+};
+
+/* Serialize a GEN9 MEDIA_STATE_FLUSH command (2 dwords) into 'dst'. */
+static inline void
+GEN9_MEDIA_STATE_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_MEDIA_STATE_FLUSH * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* Dword 0: command header. */
+   out[0] = __gen_field(values->CommandType, 29, 31) |
+            __gen_field(values->Pipeline, 27, 28) |
+            __gen_field(values->MediaCommandOpcode, 24, 26) |
+            __gen_field(values->SubOpcode, 16, 23) |
+            __gen_field(values->DwordLength, 0, 15);
+
+   /* Dword 1: flush controls. */
+   out[1] = __gen_field(values->FlushtoGO, 7, 7) |
+            __gen_field(values->WatermarkRequired, 6, 6) |
+            __gen_field(values->InterfaceDescriptorOffset, 0, 5);
+}
+
+#define GEN9_MEDIA_VFE_STATE_length_bias 0x00000002
+#define GEN9_MEDIA_VFE_STATE_header             \
+   .CommandType          =  3,                  \
+   .Pipeline             =  2,                  \
+   .MediaCommandOpcode   =  0,                  \
+   .SubOpcode            =  0,                  \
+   .DwordLength          =  7
+
+#define GEN9_MEDIA_VFE_STATE_length 0x00000009
+
+/* MEDIA_VFE_STATE command: fixed 9-dword layout configuring the video
+ * front end (scratch space, thread limits, URB/CURBE sizes, scoreboard). */
+struct GEN9_MEDIA_VFE_STATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     Pipeline;
+   uint32_t                                     MediaCommandOpcode;
+   uint32_t                                     SubOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ScratchSpaceBasePointer;
+   uint32_t                                     StackSize;
+   uint32_t                                     PerThreadScratchSpace;
+   uint32_t                                     ScratchSpaceBasePointerHigh;
+   uint32_t                                     MaximumNumberofThreads;
+   uint32_t                                     NumberofURBEntries;
+#define     Maintainingtheexistingtimestampstate               0
+#define     Resettingrelativetimerandlatchingtheglobaltimestamp       1
+   uint32_t                                     ResetGatewayTimer;
+   uint32_t                                     SliceDisable;
+   uint32_t                                     URBEntryAllocationSize;
+   uint32_t                                     CURBEAllocationSize;
+#define     Scoreboarddisabled                                 0
+#define     Scoreboardenabled                                  1
+   uint32_t                                     ScoreboardEnable;
+#define     StallingScoreboard                                 0
+#define     NonStallingScoreboard                              1
+   uint32_t                                     ScoreboardType;
+   uint32_t                                     ScoreboardMask;
+   uint32_t                                     Scoreboard3DeltaY;
+   uint32_t                                     Scoreboard3DeltaX;
+   uint32_t                                     Scoreboard2DeltaY;
+   uint32_t                                     Scoreboard2DeltaX;
+   uint32_t                                     Scoreboard1DeltaY;
+   uint32_t                                     Scoreboard1DeltaX;
+   uint32_t                                     Scoreboard0DeltaY;
+   uint32_t                                     Scoreboard0DeltaX;
+   uint32_t                                     Scoreboard7DeltaY;
+   uint32_t                                     Scoreboard7DeltaX;
+   uint32_t                                     Scoreboard6DeltaY;
+   uint32_t                                     Scoreboard6DeltaX;
+   uint32_t                                     Scoreboard5DeltaY;
+   uint32_t                                     Scoreboard5DeltaX;
+   uint32_t                                     Scoreboard4DeltaY;
+   uint32_t                                     Scoreboard4DeltaX;
+};
+
+/* Emit a GEN9 MEDIA_VFE_STATE command (9 dwords) into 'dst'.  'data' is
+ * accepted for signature uniformity; this command has no relocations. */
+static inline void
+GEN9_MEDIA_VFE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN9_MEDIA_VFE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->Pipeline, 27, 28) |
+      __gen_field(values->MediaCommandOpcode, 24, 26) |
+      __gen_field(values->SubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 15) |
+      0;
+
+   /* Scratch pointer is an offset (__gen_offset), not a masked field. */
+   dw[1] =
+      __gen_offset(values->ScratchSpaceBasePointer, 10, 31) |
+      __gen_field(values->StackSize, 4, 7) |
+      __gen_field(values->PerThreadScratchSpace, 0, 3) |
+      0;
+
+   dw[2] =
+      __gen_offset(values->ScratchSpaceBasePointerHigh, 0, 15) |
+      0;
+
+   dw[3] =
+      __gen_field(values->MaximumNumberofThreads, 16, 31) |
+      __gen_field(values->NumberofURBEntries, 8, 15) |
+      __gen_field(values->ResetGatewayTimer, 7, 7) |
+      0;
+
+   dw[4] =
+      __gen_field(values->SliceDisable, 0, 1) |
+      0;
+
+   dw[5] =
+      __gen_field(values->URBEntryAllocationSize, 16, 31) |
+      __gen_field(values->CURBEAllocationSize, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->ScoreboardEnable, 31, 31) |
+      __gen_field(values->ScoreboardType, 30, 30) |
+      __gen_field(values->ScoreboardMask, 0, 7) |
+      0;
+
+   /* Scoreboard dependency deltas 0..3, four bits per component. */
+   dw[7] =
+      __gen_field(values->Scoreboard3DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard3DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard2DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard2DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard1DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard1DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard0DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard0DeltaX, 0, 3) |
+      0;
+
+   /* Scoreboard dependency deltas 4..7. */
+   dw[8] =
+      __gen_field(values->Scoreboard7DeltaY, 28, 31) |
+      __gen_field(values->Scoreboard7DeltaX, 24, 27) |
+      __gen_field(values->Scoreboard6DeltaY, 20, 23) |
+      __gen_field(values->Scoreboard6DeltaX, 16, 19) |
+      __gen_field(values->Scoreboard5DeltaY, 12, 15) |
+      __gen_field(values->Scoreboard5DeltaX, 8, 11) |
+      __gen_field(values->Scoreboard4DeltaY, 4, 7) |
+      __gen_field(values->Scoreboard4DeltaX, 0, 3) |
+      0;
+
+}
+
+#define GEN9_MI_ARB_CHECK_length_bias 0x00000001
+#define GEN9_MI_ARB_CHECK_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  5
+
+#define GEN9_MI_ARB_CHECK_length 0x00000001
+
+/* MI_ARB_CHECK command: single dword, opcode fields only. */
+struct GEN9_MI_ARB_CHECK {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Serialize a GEN9 MI_ARB_CHECK command (1 dword) into 'dst'. */
+static inline void
+GEN9_MI_ARB_CHECK_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_MI_ARB_CHECK * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   out[0] = __gen_field(values->CommandType, 29, 31) |
+            __gen_field(values->MICommandOpcode, 23, 28);
+}
+
+#define GEN9_MI_BATCH_BUFFER_END_length_bias 0x00000001
+#define GEN9_MI_BATCH_BUFFER_END_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 10
+
+#define GEN9_MI_BATCH_BUFFER_END_length 0x00000001
+
+/* MI_BATCH_BUFFER_END command: single dword, opcode fields only. */
+struct GEN9_MI_BATCH_BUFFER_END {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Serialize a GEN9 MI_BATCH_BUFFER_END command (1 dword) into 'dst'. */
+static inline void
+GEN9_MI_BATCH_BUFFER_END_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_MI_BATCH_BUFFER_END * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   out[0] = __gen_field(values->CommandType, 29, 31) |
+            __gen_field(values->MICommandOpcode, 23, 28);
+}
+
+#define GEN9_MI_CLFLUSH_length_bias 0x00000002
+#define GEN9_MI_CLFLUSH_header                  \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 39
+
+/* Length 0: variable-length command, DwordLength supplied by the caller. */
+#define GEN9_MI_CLFLUSH_length 0x00000000
+
+/* MI_CLFLUSH command fields; packed by GEN9_MI_CLFLUSH_pack. */
+struct GEN9_MI_CLFLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTT;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           PageBaseAddress;
+   uint32_t                                     StartingCachelineOffset;
+   /* variable length fields follow */
+};
+
+/* Emit a GEN9 MI_CLFLUSH command into 'dst', relocating the page base
+ * address via 'data'.  The 64-bit relocated address is split across
+ * dwords 1 and 2, with the cacheline offset folded into the low dword. */
+static inline void
+GEN9_MI_CLFLUSH_pack(__gen_user_data *data, void * restrict dst,
+                     const struct GEN9_MI_CLFLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   /* Extra bits combined into the low address dword. */
+   uint32_t dw1 =
+      __gen_field(values->StartingCachelineOffset, 6, 11) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->PageBaseAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MI_COPY_MEM_MEM_length_bias 0x00000002
+#define GEN9_MI_COPY_MEM_MEM_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 46,                  \
+   .DwordLength          =  3
+
+#define GEN9_MI_COPY_MEM_MEM_length 0x00000005
+
+/* MI_COPY_MEM_MEM command: fixed 5-dword layout with two 64-bit
+ * relocated addresses. */
+struct GEN9_MI_COPY_MEM_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTTSource;
+/* Identical redefinition of the two values above (benign in C, per
+ * C11 6.10.3p2); the generator emits the pair once per field. */
+#define     PerProcessGraphicsAddress                          0
+#define     GlobalGraphicsAddress                              1
+   uint32_t                                     UseGlobalGTTDestination;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           DestinationMemoryAddress;
+   __gen_address_type                           SourceMemoryAddress;
+};
+
+/* Emit a GEN9 MI_COPY_MEM_MEM command (5 dwords) into 'dst'; both the
+ * destination and source addresses are relocated via 'data' and split
+ * across two dwords each. */
+static inline void
+GEN9_MI_COPY_MEM_MEM_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN9_MI_COPY_MEM_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTTSource, 22, 22) |
+      __gen_field(values->UseGlobalGTTDestination, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->DestinationMemoryAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   uint32_t dw3 =
+      0;
+
+   uint64_t qw3 =
+      __gen_combine_address(data, &dw[3], values->SourceMemoryAddress, dw3);
+
+   dw[3] = qw3;
+   dw[4] = qw3 >> 32;
+
+}
+
+#define GEN9_MI_DISPLAY_FLIP_length_bias 0x00000002
+#define GEN9_MI_DISPLAY_FLIP_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 20
+
+#define GEN9_MI_DISPLAY_FLIP_length 0x00000003
+
+/* MI_DISPLAY_FLIP command fields; packed by GEN9_MI_DISPLAY_FLIP_pack. */
+struct GEN9_MI_DISPLAY_FLIP {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         AsyncFlipIndicator;
+#define     DisplayPlane1                                      0
+#define     DisplayPlane2                                      1
+#define     DisplayPlane3                                      2
+#define     DisplayPlane4                                      4
+#define     DisplayPlane5                                      5
+#define     DisplayPlane6                                      6
+#define     DisplayPlane7                                      7
+#define     DisplayPlane8                                      8
+#define     DisplayPlane9                                      9
+#define     DisplayPlane10                                    10
+#define     DisplayPlane11                                    11
+#define     DisplayPlane12                                    12
+   uint32_t                                     DisplayPlaneSelect;
+   uint32_t                                     DwordLength;
+   bool                                         Stereoscopic3DMode;
+   uint32_t                                     DisplayBufferPitch;
+#define     Linear                                             0
+#define     TiledX                                             1
+#define     TiledYLegacyYB                                     4
+#define     TiledYF                                            5
+   /* 3-bit field (bits 0..2).  Was declared bool, which collapsed every
+    * non-Linear tiling value (TiledX=1, TiledYLegacyYB=4, TiledYF=5)
+    * to 1. */
+   uint32_t                                     TileParameter;
+   __gen_address_type                           DisplayBufferBaseAddress;
+#define     SyncFlip                                           0
+#define     AsyncFlip                                          1
+#define     Stereo3DFlip                                       2
+   uint32_t                                     FlipType;
+   __gen_address_type                           LeftEyeDisplayBufferBaseAddress;
+};
+
+/* Emit a GEN9 MI_DISPLAY_FLIP command (up to 4 dwords) into 'dst',
+ * relocating both display-buffer addresses via 'data'. */
+static inline void
+GEN9_MI_DISPLAY_FLIP_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN9_MI_DISPLAY_FLIP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->AsyncFlipIndicator, 22, 22) |
+      __gen_field(values->DisplayPlaneSelect, 8, 12) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Stereoscopic3DMode, 31, 31) |
+      __gen_field(values->DisplayBufferPitch, 6, 15) |
+      __gen_field(values->TileParameter, 0, 2) |
+      0;
+
+   /* Flip type is combined into the relocated buffer-address dword. */
+   uint32_t dw2 =
+      __gen_field(values->FlipType, 0, 1) |
+      0;
+
+   dw[2] =
+      __gen_combine_address(data, &dw[2], values->DisplayBufferBaseAddress, dw2);
+
+   uint32_t dw3 =
+      0;
+
+   dw[3] =
+      __gen_combine_address(data, &dw[3], values->LeftEyeDisplayBufferBaseAddress, dw3);
+
+}
+
+#define GEN9_MI_LOAD_REGISTER_MEM_length_bias 0x00000002
+#define GEN9_MI_LOAD_REGISTER_MEM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 41,                  \
+   .DwordLength          =  2
+
+#define GEN9_MI_LOAD_REGISTER_MEM_length 0x00000004
+
+/* MI_LOAD_REGISTER_MEM command: fixed 4-dword layout loading a register
+ * from a 64-bit relocated memory address. */
+struct GEN9_MI_LOAD_REGISTER_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         UseGlobalGTT;
+   uint32_t                                     AsyncModeEnable;
+   uint32_t                                     DwordLength;
+   uint32_t                                     RegisterAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Emit a GEN9 MI_LOAD_REGISTER_MEM command (4 dwords) into 'dst',
+ * relocating the source memory address via 'data'. */
+static inline void
+GEN9_MI_LOAD_REGISTER_MEM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_MI_LOAD_REGISTER_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UseGlobalGTT, 22, 22) |
+      __gen_field(values->AsyncModeEnable, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   /* Register address is dword-aligned: an offset, not a masked field. */
+   dw[1] =
+      __gen_offset(values->RegisterAddress, 2, 22) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+#define GEN9_MI_LOAD_SCAN_LINES_EXCL_length_bias 0x00000002
+#define GEN9_MI_LOAD_SCAN_LINES_EXCL_header     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 19,                  \
+   .DwordLength          =  0
+
+#define GEN9_MI_LOAD_SCAN_LINES_EXCL_length 0x00000002
+
+/* MI_LOAD_SCAN_LINES_EXCL command: fixed 2-dword layout. */
+struct GEN9_MI_LOAD_SCAN_LINES_EXCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlaneA                                      0
+#define     DisplayPlaneB                                      1
+#define     DisplayPlaneC                                      4
+   uint32_t                                     DisplayPlaneSelect;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+/* Serialize a GEN9 MI_LOAD_SCAN_LINES_EXCL command (2 dwords) into 'dst'. */
+static inline void
+GEN9_MI_LOAD_SCAN_LINES_EXCL_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN9_MI_LOAD_SCAN_LINES_EXCL * restrict values)
+{
+   uint32_t * restrict out = (uint32_t * restrict) dst;
+
+   /* Dword 0: command header and plane select. */
+   out[0] = __gen_field(values->CommandType, 29, 31) |
+            __gen_field(values->MICommandOpcode, 23, 28) |
+            __gen_field(values->DisplayPlaneSelect, 19, 21) |
+            __gen_field(values->DwordLength, 0, 5);
+
+   /* Dword 1: scan-line range. */
+   out[1] = __gen_field(values->StartScanLineNumber, 16, 28) |
+            __gen_field(values->EndScanLineNumber, 0, 12);
+}
+
+#define GEN9_MI_LOAD_SCAN_LINES_INCL_length_bias 0x00000002
+#define GEN9_MI_LOAD_SCAN_LINES_INCL_header     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 18,                  \
+   .DwordLength          =  0
+
+#define GEN9_MI_LOAD_SCAN_LINES_INCL_length 0x00000002
+
+/* MI_LOAD_SCAN_LINES_INCL command: fixed 2-dword layout. */
+struct GEN9_MI_LOAD_SCAN_LINES_INCL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     DisplayPlane1A                                     0
+#define     DisplayPlane1B                                     1
+#define     DisplayPlane1C                                     4
+   uint32_t                                     DisplayPlaneSelect;
+#define     NeverForward                                       0
+#define     AlwaysForward                                      1
+#define     ConditionallyForward                               2
+   /* 2-bit field (bits 17..18).  Was declared bool, which collapsed
+    * ConditionallyForward (2) to AlwaysForward (1). */
+   uint32_t                                     ScanLineEventDoneForward;
+   uint32_t                                     DwordLength;
+   uint32_t                                     StartScanLineNumber;
+   uint32_t                                     EndScanLineNumber;
+};
+
+/* Emit a GEN9 MI_LOAD_SCAN_LINES_INCL command (2 dwords) into 'dst'. */
+static inline void
+GEN9_MI_LOAD_SCAN_LINES_INCL_pack(__gen_user_data *data, void * restrict dst,
+                                  const struct GEN9_MI_LOAD_SCAN_LINES_INCL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlaneSelect, 19, 21) |
+      __gen_field(values->ScanLineEventDoneForward, 17, 18) |
+      __gen_field(values->DwordLength, 0, 5) |
+      0;
+
+   dw[1] =
+      __gen_field(values->StartScanLineNumber, 16, 28) |
+      __gen_field(values->EndScanLineNumber, 0, 12) |
+      0;
+
+}
+
+#define GEN9_MI_LOAD_URB_MEM_length_bias 0x00000002
+#define GEN9_MI_LOAD_URB_MEM_header             \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 44,                  \
+   .DwordLength          =  2
+
+#define GEN9_MI_LOAD_URB_MEM_length 0x00000004
+
+/* MI_LOAD_URB_MEM (MI opcode 44), 4 dwords: loads URB data at URBAddress
+ * from the given 64-bit memory address.
+ */
+struct GEN9_MI_LOAD_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into 4 dwords at dst; MemoryAddress is emitted as a 64-bit
+ * relocation (dw[2..3]) via __gen_combine_address using the data handle.
+ */
+static inline void
+GEN9_MI_LOAD_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                          const struct GEN9_MI_LOAD_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+#define GEN9_MI_MATH_length_bias 0x00000002
+#define GEN9_MI_MATH_header                     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 26
+
+#define GEN9_MI_MATH_length 0x00000000
+
+/* MI_MATH (MI opcode 26): command-streamer ALU program. Variable length
+ * (_length is 0); additional ALU instruction dwords follow dw[2] and must
+ * be appended by the caller, with DwordLength set accordingly.
+ */
+struct GEN9_MI_MATH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     ALUINSTRUCTION1;
+   uint32_t                                     ALUINSTRUCTION2;
+   /* variable length fields follow */
+};
+
+/* Packs the fixed 3-dword prefix at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_MATH_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN9_MI_MATH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ALUINSTRUCTION1, 0, 31) |
+      0;
+
+   dw[2] =
+      __gen_field(values->ALUINSTRUCTION2, 0, 31) |
+      0;
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MI_NOOP_length_bias 0x00000001
+#define GEN9_MI_NOOP_header                     \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  0
+
+#define GEN9_MI_NOOP_length 0x00000001
+
+/* MI_NOOP (MI opcode 0), 1 dword: no-op; can optionally latch a 22-bit
+ * identification number when the write-enable bit is set.
+ */
+struct GEN9_MI_NOOP {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         IdentificationNumberRegisterWriteEnable;
+   uint32_t                                     IdentificationNumber;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_NOOP_pack(__gen_user_data *data, void * restrict dst,
+                  const struct GEN9_MI_NOOP * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->IdentificationNumberRegisterWriteEnable, 22, 22) |
+      __gen_field(values->IdentificationNumber, 0, 21) |
+      0;
+
+}
+
+#define GEN9_MI_PREDICATE_length_bias 0x00000001
+#define GEN9_MI_PREDICATE_header                \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 12
+
+#define GEN9_MI_PREDICATE_length 0x00000001
+
+/* MI_PREDICATE (MI opcode 12), 1 dword: updates the render predicate from
+ * the load/combine/compare operation selectors below.
+ */
+struct GEN9_MI_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     LOAD_KEEP                                          0
+#define     LOAD_LOAD                                          2
+#define     LOAD_LOADINV                                       3
+   uint32_t                                     LoadOperation;
+#define     COMBINE_SET                                        0
+#define     COMBINE_AND                                        1
+#define     COMBINE_OR                                         2
+#define     COMBINE_XOR                                        3
+   uint32_t                                     CombineOperation;
+#define     COMPARE_SRCS_EQUAL                                 2
+#define     COMPARE_DELTAS_EQUAL                               3
+   uint32_t                                     CompareOperation;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_MI_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->LoadOperation, 6, 7) |
+      __gen_field(values->CombineOperation, 3, 4) |
+      __gen_field(values->CompareOperation, 0, 1) |
+      0;
+
+}
+
+#define GEN9_MI_REPORT_HEAD_length_bias 0x00000001
+#define GEN9_MI_REPORT_HEAD_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  7
+
+#define GEN9_MI_REPORT_HEAD_length 0x00000001
+
+/* MI_REPORT_HEAD (MI opcode 7), 1 dword: header-only command, no operands. */
+struct GEN9_MI_REPORT_HEAD {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_REPORT_HEAD_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_MI_REPORT_HEAD * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN9_MI_RS_CONTEXT_length_bias 0x00000001
+#define GEN9_MI_RS_CONTEXT_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 15
+
+#define GEN9_MI_RS_CONTEXT_length 0x00000001
+
+/* MI_RS_CONTEXT (MI opcode 15), 1 dword: resource-streamer context
+ * save/restore selector (RS_SAVE / RS_RESTORE in bit 0).
+ */
+struct GEN9_MI_RS_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_RESTORE                                         0
+#define     RS_SAVE                                            1
+   uint32_t                                     ResourceStreamerSave;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_RS_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_MI_RS_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerSave, 0, 0) |
+      0;
+
+}
+
+#define GEN9_MI_RS_CONTROL_length_bias 0x00000001
+#define GEN9_MI_RS_CONTROL_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  6
+
+#define GEN9_MI_RS_CONTROL_length 0x00000001
+
+/* MI_RS_CONTROL (MI opcode 6), 1 dword: starts/stops the resource streamer
+ * (RS_START / RS_STOP in bit 0).
+ */
+struct GEN9_MI_RS_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     RS_STOP                                            0
+#define     RS_START                                           1
+   uint32_t                                     ResourceStreamerControl;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_RS_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_MI_RS_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->ResourceStreamerControl, 0, 0) |
+      0;
+
+}
+
+#define GEN9_MI_RS_STORE_DATA_IMM_length_bias 0x00000002
+#define GEN9_MI_RS_STORE_DATA_IMM_header        \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 43,                  \
+   .DwordLength          =  2
+
+#define GEN9_MI_RS_STORE_DATA_IMM_length 0x00000004
+
+/* MI_RS_STORE_DATA_IMM (MI opcode 43), 4 dwords: stores one immediate
+ * dword to a 64-bit destination address.
+ */
+struct GEN9_MI_RS_STORE_DATA_IMM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           DestinationAddress;
+   uint32_t                                     CoreModeEnable;
+   uint32_t                                     DataDWord0;
+};
+
+/* Packs *values into 4 dwords at dst; DestinationAddress is emitted as a
+ * 64-bit relocation (dw[1..2]) with CoreModeEnable OR'd into its low bit.
+ */
+static inline void
+GEN9_MI_RS_STORE_DATA_IMM_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_MI_RS_STORE_DATA_IMM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->CoreModeEnable, 0, 0) |
+      0;
+
+   uint64_t qw1 =
+      __gen_combine_address(data, &dw[1], values->DestinationAddress, dw1);
+
+   dw[1] = qw1;
+   dw[2] = qw1 >> 32;
+
+   dw[3] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+}
+
+#define GEN9_MI_SET_CONTEXT_length_bias 0x00000002
+#define GEN9_MI_SET_CONTEXT_header              \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 24,                  \
+   .DwordLength          =  0
+
+#define GEN9_MI_SET_CONTEXT_length 0x00000002
+
+/* MI_SET_CONTEXT (MI opcode 24), 2 dwords: switches the logical render
+ * context; control flags share dw[1] with the context address.
+ */
+struct GEN9_MI_SET_CONTEXT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           LogicalContextAddress;
+   uint32_t                                     ReservedMustbe1;
+   bool                                         CoreModeEnable;
+   bool                                         ResourceStreamerStateSaveEnable;
+   bool                                         ResourceStreamerStateRestoreEnable;
+   uint32_t                                     ForceRestore;
+   uint32_t                                     RestoreInhibit;
+};
+
+/* Packs *values into 2 dwords at dst. Unlike the 64-bit address commands,
+ * only the low 32 bits of __gen_combine_address are kept in dw[1] (the
+ * context address is a single dword here), with the flag bits OR'd in.
+ */
+static inline void
+GEN9_MI_SET_CONTEXT_pack(__gen_user_data *data, void * restrict dst,
+                         const struct GEN9_MI_SET_CONTEXT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   uint32_t dw1 =
+      __gen_field(values->ReservedMustbe1, 8, 8) |
+      __gen_field(values->CoreModeEnable, 4, 4) |
+      __gen_field(values->ResourceStreamerStateSaveEnable, 3, 3) |
+      __gen_field(values->ResourceStreamerStateRestoreEnable, 2, 2) |
+      __gen_field(values->ForceRestore, 1, 1) |
+      __gen_field(values->RestoreInhibit, 0, 0) |
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->LogicalContextAddress, dw1);
+
+}
+
+#define GEN9_MI_SET_PREDICATE_length_bias 0x00000001
+#define GEN9_MI_SET_PREDICATE_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  1
+
+#define GEN9_MI_SET_PREDICATE_length 0x00000001
+
+/* MI_SET_PREDICATE (MI opcode 1), 1 dword: selects when subsequent
+ * commands are NOOP'd, per the 4-bit PREDICATE ENABLE encodings below.
+ */
+struct GEN9_MI_SET_PREDICATE {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+#define     NOOPNever                                          0
+#define     NOOPonResult2clear                                 1
+#define     NOOPonResult2set                                   2
+#define     NOOPonResultclear                                  3
+#define     NOOPonResultset                                    4
+#define     Executewhenonesliceenabled                         5
+#define     Executewhentwoslicesareenabled                     6
+#define     Executewhenthreeslicesareenabled                   7
+#define     NOOPAlways                                        15
+   uint32_t                                     PREDICATEENABLE;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_SET_PREDICATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_MI_SET_PREDICATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->PREDICATEENABLE, 0, 3) |
+      0;
+
+}
+
+#define GEN9_MI_STORE_DATA_INDEX_length_bias 0x00000002
+#define GEN9_MI_STORE_DATA_INDEX_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 33,                  \
+   .DwordLength          =  1
+
+#define GEN9_MI_STORE_DATA_INDEX_length 0x00000003
+
+/* MI_STORE_DATA_INDEX (MI opcode 33): stores one or two immediate dwords
+ * at an offset into the hardware status page.
+ */
+struct GEN9_MI_STORE_DATA_INDEX {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     UsePerProcessHardwareStatusPage;
+   uint32_t                                     DwordLength;
+   uint32_t                                     Offset;
+   uint32_t                                     DataDWord0;
+   uint32_t                                     DataDWord1;
+};
+
+/* Packs *values at dst; data is unused (no relocations).
+ * NOTE(review): _length is 0x3 (3 dwords, matching the header's
+ * DwordLength of 1 + bias 2), yet this pack always writes dw[3]
+ * (DataDWord1) — callers emitting the 3-dword form must still provide a
+ * 4-dword buffer or the write to dw[3] lands out of bounds. Verify against
+ * the generator / command description.
+ */
+static inline void
+GEN9_MI_STORE_DATA_INDEX_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_MI_STORE_DATA_INDEX * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->UsePerProcessHardwareStatusPage, 21, 21) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Offset, 2, 11) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DataDWord0, 0, 31) |
+      0;
+
+   dw[3] =
+      __gen_field(values->DataDWord1, 0, 31) |
+      0;
+
+}
+
+#define GEN9_MI_STORE_URB_MEM_length_bias 0x00000002
+#define GEN9_MI_STORE_URB_MEM_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 45,                  \
+   .DwordLength          =  2
+
+#define GEN9_MI_STORE_URB_MEM_length 0x00000004
+
+/* MI_STORE_URB_MEM (MI opcode 45), 4 dwords: stores URB data at
+ * URBAddress to the given 64-bit memory address (inverse of
+ * MI_LOAD_URB_MEM, same layout).
+ */
+struct GEN9_MI_STORE_URB_MEM {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   uint32_t                                     URBAddress;
+   __gen_address_type                           MemoryAddress;
+};
+
+/* Packs *values into 4 dwords at dst; MemoryAddress is emitted as a 64-bit
+ * relocation (dw[2..3]) via __gen_combine_address using the data handle.
+ */
+static inline void
+GEN9_MI_STORE_URB_MEM_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_MI_STORE_URB_MEM * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->URBAddress, 2, 14) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->MemoryAddress, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+}
+
+#define GEN9_MI_SUSPEND_FLUSH_length_bias 0x00000001
+#define GEN9_MI_SUSPEND_FLUSH_header            \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 11
+
+#define GEN9_MI_SUSPEND_FLUSH_length 0x00000001
+
+/* MI_SUSPEND_FLUSH (MI opcode 11), 1 dword: single enable bit in bit 0. */
+struct GEN9_MI_SUSPEND_FLUSH {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         SuspendFlush;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_SUSPEND_FLUSH_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_MI_SUSPEND_FLUSH * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->SuspendFlush, 0, 0) |
+      0;
+
+}
+
+#define GEN9_MI_TOPOLOGY_FILTER_length_bias 0x00000001
+#define GEN9_MI_TOPOLOGY_FILTER_header          \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 13
+
+#define GEN9_MI_TOPOLOGY_FILTER_length 0x00000001
+
+/* MI_TOPOLOGY_FILTER (MI opcode 13), 1 dword: 6-bit topology filter value
+ * in bits 0-5.
+ */
+struct GEN9_MI_TOPOLOGY_FILTER {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     TopologyFilterValue;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_TOPOLOGY_FILTER_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN9_MI_TOPOLOGY_FILTER * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->TopologyFilterValue, 0, 5) |
+      0;
+
+}
+
+#define GEN9_MI_UPDATE_GTT_length_bias 0x00000002
+#define GEN9_MI_UPDATE_GTT_header               \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      = 35
+
+#define GEN9_MI_UPDATE_GTT_length 0x00000000
+
+/* MI_UPDATE_GTT (MI opcode 35): variable length (_length is 0); the GTT
+ * entry payload dwords follow dw[1] and must be appended by the caller,
+ * with DwordLength (bits 0-9) set accordingly.
+ */
+struct GEN9_MI_UPDATE_GTT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     DwordLength;
+   __gen_address_type                           EntryAddress;
+   /* variable length fields follow */
+};
+
+/* Packs the fixed 2-dword prefix at dst; EntryAddress occupies a single
+ * dword (only the low 32 bits of __gen_combine_address are kept).
+ */
+static inline void
+GEN9_MI_UPDATE_GTT_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_MI_UPDATE_GTT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DwordLength, 0, 9) |
+      0;
+
+   uint32_t dw1 =
+      0;
+
+   dw[1] =
+      __gen_combine_address(data, &dw[1], values->EntryAddress, dw1);
+
+   /* variable length fields follow */
+}
+
+#define GEN9_MI_URB_ATOMIC_ALLOC_length_bias 0x00000001
+#define GEN9_MI_URB_ATOMIC_ALLOC_header         \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  9
+
+#define GEN9_MI_URB_ATOMIC_ALLOC_length 0x00000001
+
+/* MI_URB_ATOMIC_ALLOC (MI opcode 9), 1 dword: URB atomic storage offset
+ * (bits 12-19) and size (bits 0-8).
+ */
+struct GEN9_MI_URB_ATOMIC_ALLOC {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   uint32_t                                     URBAtomicStorageOffset;
+   uint32_t                                     URBAtomicStorageSize;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_URB_ATOMIC_ALLOC_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_MI_URB_ATOMIC_ALLOC * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->URBAtomicStorageOffset, 12, 19) |
+      __gen_field(values->URBAtomicStorageSize, 0, 8) |
+      0;
+
+}
+
+#define GEN9_MI_USER_INTERRUPT_length_bias 0x00000001
+#define GEN9_MI_USER_INTERRUPT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  2
+
+#define GEN9_MI_USER_INTERRUPT_length 0x00000001
+
+/* MI_USER_INTERRUPT (MI opcode 2), 1 dword: header-only, no operands. */
+struct GEN9_MI_USER_INTERRUPT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_USER_INTERRUPT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_MI_USER_INTERRUPT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      0;
+
+}
+
+#define GEN9_MI_WAIT_FOR_EVENT_length_bias 0x00000001
+#define GEN9_MI_WAIT_FOR_EVENT_header           \
+   .CommandType          =  0,                  \
+   .MICommandOpcode      =  3
+
+#define GEN9_MI_WAIT_FOR_EVENT_length 0x00000001
+
+/* MI_WAIT_FOR_EVENT (MI opcode 3), 1 dword: each bool selects a display
+ * event (vblank / scan-line / flip-pending) to wait on; bits 4-5, 12 and
+ * 22 carry no field in this encoding.
+ */
+struct GEN9_MI_WAIT_FOR_EVENT {
+   uint32_t                                     CommandType;
+   uint32_t                                     MICommandOpcode;
+   bool                                         DisplayPlane1CVerticalBlankWaitEnable;
+   bool                                         DisplayPlane6FlipPendingWaitEnable;
+   bool                                         DisplayPlane12FlipPendingWaitEnable;
+   bool                                         DisplayPlane11FlipPendingWaitEnable;
+   bool                                         DisplayPlane10FlipPendingWaitEnable;
+   bool                                         DisplayPlane9FlipPendingWaitEnable;
+   bool                                         DisplayPlane3FlipPendingWaitEnable;
+   bool                                         DisplayPlane1CScanLineWaitEnable;
+   bool                                         DisplayPlane1BVerticalBlankWaitEnable;
+   bool                                         DisplayPlane5FlipPendingWaitEnable;
+   bool                                         DisplayPlane2FlipPendingWaitEnable;
+   bool                                         DisplayPlane1BScanLineWaitEnable;
+   bool                                         DisplayPlane8FlipPendingWaitEnable;
+   bool                                         DisplayPlane7FlipPendingWaitEnable;
+   bool                                         DisplayPlane1AVerticalBlankWaitEnable;
+   bool                                         DisplayPlane4FlipPendingWaitEnable;
+   bool                                         DisplayPlane1FlipPendingWaitEnable;
+   /* NOTE(review): "Plnae" is a typo ("Plane 1A Scan Line Wait Enable",
+    * bit 0) carried over from the generator's XML source. Renaming would
+    * break existing callers; fix in the XML description and regenerate.
+    */
+   bool                                         DisplayPlnae1AScanLineWaitEnable;
+};
+
+/* Packs *values into 1 dword at dst; data is unused (no relocations). */
+static inline void
+GEN9_MI_WAIT_FOR_EVENT_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_MI_WAIT_FOR_EVENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->MICommandOpcode, 23, 28) |
+      __gen_field(values->DisplayPlane1CVerticalBlankWaitEnable, 21, 21) |
+      __gen_field(values->DisplayPlane6FlipPendingWaitEnable, 20, 20) |
+      __gen_field(values->DisplayPlane12FlipPendingWaitEnable, 19, 19) |
+      __gen_field(values->DisplayPlane11FlipPendingWaitEnable, 18, 18) |
+      __gen_field(values->DisplayPlane10FlipPendingWaitEnable, 17, 17) |
+      __gen_field(values->DisplayPlane9FlipPendingWaitEnable, 16, 16) |
+      __gen_field(values->DisplayPlane3FlipPendingWaitEnable, 15, 15) |
+      __gen_field(values->DisplayPlane1CScanLineWaitEnable, 14, 14) |
+      __gen_field(values->DisplayPlane1BVerticalBlankWaitEnable, 11, 11) |
+      __gen_field(values->DisplayPlane5FlipPendingWaitEnable, 10, 10) |
+      __gen_field(values->DisplayPlane2FlipPendingWaitEnable, 9, 9) |
+      __gen_field(values->DisplayPlane1BScanLineWaitEnable, 8, 8) |
+      __gen_field(values->DisplayPlane8FlipPendingWaitEnable, 7, 7) |
+      __gen_field(values->DisplayPlane7FlipPendingWaitEnable, 6, 6) |
+      __gen_field(values->DisplayPlane1AVerticalBlankWaitEnable, 3, 3) |
+      __gen_field(values->DisplayPlane4FlipPendingWaitEnable, 2, 2) |
+      __gen_field(values->DisplayPlane1FlipPendingWaitEnable, 1, 1) |
+      __gen_field(values->DisplayPlnae1AScanLineWaitEnable, 0, 0) |
+      0;
+
+}
+
+#define GEN9_PIPE_CONTROL_length_bias 0x00000002
+#define GEN9_PIPE_CONTROL_header                \
+   .CommandType          =  3,                  \
+   .CommandSubType       =  3,                  \
+   ._3DCommandOpcode     =  2,                  \
+   ._3DCommandSubOpcode  =  0,                  \
+   .DwordLength          =  4
+
+#define GEN9_PIPE_CONTROL_length 0x00000006
+
+/* PIPE_CONTROL (3D pipeline, opcode 2/0), 6 dwords: flush/invalidate and
+ * post-sync control. dw[1] holds the flush/stall/invalidate flags, dw[2..3]
+ * the post-sync address, dw[4..5] the 64-bit immediate data.
+ */
+struct GEN9_PIPE_CONTROL {
+   uint32_t                                     CommandType;
+   uint32_t                                     CommandSubType;
+   uint32_t                                     _3DCommandOpcode;
+   uint32_t                                     _3DCommandSubOpcode;
+   uint32_t                                     DwordLength;
+   bool                                         FlushLLC;
+#define     DAT_PPGTT                                          0
+#define     DAT_GGTT                                           1
+   uint32_t                                     DestinationAddressType;
+#define     NoLRIOperation                                     0
+#define     MMIOWriteImmediateData                             1
+   uint32_t                                     LRIPostSyncOperation;
+   uint32_t                                     StoreDataIndex;
+   uint32_t                                     CommandStreamerStallEnable;
+#define     DontReset                                          0
+#define     Reset                                              1
+   uint32_t                                     GlobalSnapshotCountReset;
+   uint32_t                                     TLBInvalidate;
+   bool                                         GenericMediaStateClear;
+#define     NoWrite                                            0
+#define     WriteImmediateData                                 1
+#define     WritePSDepthCount                                  2
+#define     WriteTimestamp                                     3
+   uint32_t                                     PostSyncOperation;
+   bool                                         DepthStallEnable;
+#define     DisableFlush                                       0
+#define     EnableFlush                                        1
+   bool                                         RenderTargetCacheFlushEnable;
+   bool                                         InstructionCacheInvalidateEnable;
+   bool                                         TextureCacheInvalidationEnable;
+   bool                                         IndirectStatePointersDisable;
+   bool                                         NotifyEnable;
+   bool                                         PipeControlFlushEnable;
+   bool                                         DCFlushEnable;
+   bool                                         VFCacheInvalidationEnable;
+   bool                                         ConstantCacheInvalidationEnable;
+   bool                                         StateCacheInvalidationEnable;
+   bool                                         StallAtPixelScoreboard;
+#define     FlushDisabled                                      0
+#define     FlushEnabled                                       1
+   bool                                         DepthCacheFlushEnable;
+   __gen_address_type                           Address;
+   uint64_t                                     ImmediateData;
+};
+
+/* Packs *values into 6 dwords at dst; Address is emitted as a 64-bit
+ * relocation (dw[2..3]) via __gen_combine_address using the data handle.
+ */
+static inline void
+GEN9_PIPE_CONTROL_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_PIPE_CONTROL * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->CommandType, 29, 31) |
+      __gen_field(values->CommandSubType, 27, 28) |
+      __gen_field(values->_3DCommandOpcode, 24, 26) |
+      __gen_field(values->_3DCommandSubOpcode, 16, 23) |
+      __gen_field(values->DwordLength, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->FlushLLC, 26, 26) |
+      __gen_field(values->DestinationAddressType, 24, 24) |
+      __gen_field(values->LRIPostSyncOperation, 23, 23) |
+      __gen_field(values->StoreDataIndex, 21, 21) |
+      __gen_field(values->CommandStreamerStallEnable, 20, 20) |
+      __gen_field(values->GlobalSnapshotCountReset, 19, 19) |
+      __gen_field(values->TLBInvalidate, 18, 18) |
+      __gen_field(values->GenericMediaStateClear, 16, 16) |
+      __gen_field(values->PostSyncOperation, 14, 15) |
+      __gen_field(values->DepthStallEnable, 13, 13) |
+      __gen_field(values->RenderTargetCacheFlushEnable, 12, 12) |
+      __gen_field(values->InstructionCacheInvalidateEnable, 11, 11) |
+      __gen_field(values->TextureCacheInvalidationEnable, 10, 10) |
+      __gen_field(values->IndirectStatePointersDisable, 9, 9) |
+      __gen_field(values->NotifyEnable, 8, 8) |
+      __gen_field(values->PipeControlFlushEnable, 7, 7) |
+      __gen_field(values->DCFlushEnable, 5, 5) |
+      __gen_field(values->VFCacheInvalidationEnable, 4, 4) |
+      __gen_field(values->ConstantCacheInvalidationEnable, 3, 3) |
+      __gen_field(values->StateCacheInvalidationEnable, 2, 2) |
+      __gen_field(values->StallAtPixelScoreboard, 1, 1) |
+      __gen_field(values->DepthCacheFlushEnable, 0, 0) |
+      0;
+
+   uint32_t dw2 =
+      0;
+
+   uint64_t qw2 =
+      __gen_combine_address(data, &dw[2], values->Address, dw2);
+
+   dw[2] = qw2;
+   dw[3] = qw2 >> 32;
+
+   uint64_t qw4 =
+      __gen_field(values->ImmediateData, 0, 63) |
+      0;
+
+   dw[4] = qw4;
+   dw[5] = qw4 >> 32;
+
+}
+
+#define GEN9_SCISSOR_RECT_length 0x00000002
+
+/* SCISSOR_RECT state entry, 2 dwords: 16-bit min/max X and Y packed as
+ * (YMin|XMin), (YMax|XMax). No command header — this is indirect state.
+ */
+struct GEN9_SCISSOR_RECT {
+   uint32_t                                     ScissorRectangleYMin;
+   uint32_t                                     ScissorRectangleXMin;
+   uint32_t                                     ScissorRectangleYMax;
+   uint32_t                                     ScissorRectangleXMax;
+};
+
+/* Packs *values into 2 dwords at dst; data is unused (no relocations). */
+static inline void
+GEN9_SCISSOR_RECT_pack(__gen_user_data *data, void * restrict dst,
+                       const struct GEN9_SCISSOR_RECT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ScissorRectangleYMin, 16, 31) |
+      __gen_field(values->ScissorRectangleXMin, 0, 15) |
+      0;
+
+   dw[1] =
+      __gen_field(values->ScissorRectangleYMax, 16, 31) |
+      __gen_field(values->ScissorRectangleXMax, 0, 15) |
+      0;
+
+}
+
+#define GEN9_SF_CLIP_VIEWPORT_length 0x00000010
+
+/* SF_CLIP_VIEWPORT state entry, 16 dwords: viewport transform matrix
+ * elements, guardband extents, and viewport X/Y extents, all as raw IEEE
+ * floats. dw[6] and dw[7] carry no fields (written as 0).
+ */
+struct GEN9_SF_CLIP_VIEWPORT {
+   float                                        ViewportMatrixElementm00;
+   float                                        ViewportMatrixElementm11;
+   float                                        ViewportMatrixElementm22;
+   float                                        ViewportMatrixElementm30;
+   float                                        ViewportMatrixElementm31;
+   float                                        ViewportMatrixElementm32;
+   float                                        XMinClipGuardband;
+   float                                        XMaxClipGuardband;
+   float                                        YMinClipGuardband;
+   float                                        YMaxClipGuardband;
+   float                                        XMinViewPort;
+   float                                        XMaxViewPort;
+   float                                        YMinViewPort;
+   float                                        YMaxViewPort;
+};
+
+/* Packs *values into 16 dwords at dst via __gen_float bit casts; data is
+ * unused (no relocations).
+ */
+static inline void
+GEN9_SF_CLIP_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_SF_CLIP_VIEWPORT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->ViewportMatrixElementm00) |
+      0;
+
+   dw[1] =
+      __gen_float(values->ViewportMatrixElementm11) |
+      0;
+
+   dw[2] =
+      __gen_float(values->ViewportMatrixElementm22) |
+      0;
+
+   dw[3] =
+      __gen_float(values->ViewportMatrixElementm30) |
+      0;
+
+   dw[4] =
+      __gen_float(values->ViewportMatrixElementm31) |
+      0;
+
+   dw[5] =
+      __gen_float(values->ViewportMatrixElementm32) |
+      0;
+
+   dw[6] =
+      0;
+
+   dw[7] =
+      0;
+
+   dw[8] =
+      __gen_float(values->XMinClipGuardband) |
+      0;
+
+   dw[9] =
+      __gen_float(values->XMaxClipGuardband) |
+      0;
+
+   dw[10] =
+      __gen_float(values->YMinClipGuardband) |
+      0;
+
+   dw[11] =
+      __gen_float(values->YMaxClipGuardband) |
+      0;
+
+   dw[12] =
+      __gen_float(values->XMinViewPort) |
+      0;
+
+   dw[13] =
+      __gen_float(values->XMaxViewPort) |
+      0;
+
+   dw[14] =
+      __gen_float(values->YMinViewPort) |
+      0;
+
+   dw[15] =
+      __gen_float(values->YMaxViewPort) |
+      0;
+
+}
+
+/* BLEND_STATE is 17 dwords: a 1-dword header plus 8 two-dword entries. */
+#define GEN9_BLEND_STATE_length 0x00000011
+
+#define GEN9_BLEND_STATE_ENTRY_length 0x00000002
+
+/* Per-render-target blend state entry (2 dwords). */
+struct GEN9_BLEND_STATE_ENTRY {
+   bool                                         LogicOpEnable;
+   uint32_t                                     LogicOpFunction;
+   uint32_t                                     PreBlendSourceOnlyClampEnable;
+#define     COLORCLAMP_UNORM                                   0
+#define     COLORCLAMP_SNORM                                   1
+#define     COLORCLAMP_RTFORMAT                                2
+   uint32_t                                     ColorClampRange;
+   bool                                         PreBlendColorClampEnable;
+   bool                                         PostBlendColorClampEnable;
+   bool                                         ColorBufferBlendEnable;
+   uint32_t                                     SourceBlendFactor;
+   uint32_t                                     DestinationBlendFactor;
+   uint32_t                                     ColorBlendFunction;
+   uint32_t                                     SourceAlphaBlendFactor;
+   uint32_t                                     DestinationAlphaBlendFactor;
+   uint32_t                                     AlphaBlendFunction;
+   bool                                         WriteDisableAlpha;
+   bool                                         WriteDisableRed;
+   bool                                         WriteDisableGreen;
+   bool                                         WriteDisableBlue;
+};
+
+/* Packs a GEN9_BLEND_STATE_ENTRY (2 dwords).  The fields are laid out
+ * across a 64-bit span, so they are assembled into one 64-bit value
+ * (bit positions >= 32 land in the second dword) and then split.
+ */
+static inline void
+GEN9_BLEND_STATE_ENTRY_pack(__gen_user_data *data, void * restrict dst,
+                            const struct GEN9_BLEND_STATE_ENTRY * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   uint64_t qw0 =
+      __gen_field(values->LogicOpEnable, 63, 63) |
+      __gen_field(values->LogicOpFunction, 59, 62) |
+      __gen_field(values->PreBlendSourceOnlyClampEnable, 36, 36) |
+      __gen_field(values->ColorClampRange, 34, 35) |
+      __gen_field(values->PreBlendColorClampEnable, 33, 33) |
+      __gen_field(values->PostBlendColorClampEnable, 32, 32) |
+      __gen_field(values->ColorBufferBlendEnable, 31, 31) |
+      __gen_field(values->SourceBlendFactor, 26, 30) |
+      __gen_field(values->DestinationBlendFactor, 21, 25) |
+      __gen_field(values->ColorBlendFunction, 18, 20) |
+      __gen_field(values->SourceAlphaBlendFactor, 13, 17) |
+      __gen_field(values->DestinationAlphaBlendFactor, 8, 12) |
+      __gen_field(values->AlphaBlendFunction, 5, 7) |
+      __gen_field(values->WriteDisableAlpha, 3, 3) |
+      __gen_field(values->WriteDisableRed, 2, 2) |
+      __gen_field(values->WriteDisableGreen, 1, 1) |
+      __gen_field(values->WriteDisableBlue, 0, 0) |
+      0;
+
+   /* Split the 64-bit value into the entry's two dwords. */
+   dw[0] = qw0;
+   dw[1] = qw0 >> 32;
+
+}
+
+/* Top-level blend state: global alpha-test/dither controls plus one
+ * BLEND_STATE_ENTRY per render target (8 entries).
+ */
+struct GEN9_BLEND_STATE {
+   bool                                         AlphaToCoverageEnable;
+   bool                                         IndependentAlphaBlendEnable;
+   bool                                         AlphaToOneEnable;
+   bool                                         AlphaToCoverageDitherEnable;
+   bool                                         AlphaTestEnable;
+   uint32_t                                     AlphaTestFunction;
+   bool                                         ColorDitherEnable;
+   uint32_t                                     XDitherOffset;
+   uint32_t                                     YDitherOffset;
+   struct GEN9_BLEND_STATE_ENTRY                Entry[8];
+};
+
+/* Packs a GEN9_BLEND_STATE (17 dwords = 1 header dword + 8 entries of
+ * 2 dwords each, matching GEN9_BLEND_STATE_length 0x11).
+ */
+static inline void
+GEN9_BLEND_STATE_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN9_BLEND_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->AlphaToCoverageEnable, 31, 31) |
+      __gen_field(values->IndependentAlphaBlendEnable, 30, 30) |
+      __gen_field(values->AlphaToOneEnable, 29, 29) |
+      __gen_field(values->AlphaToCoverageDitherEnable, 28, 28) |
+      __gen_field(values->AlphaTestEnable, 27, 27) |
+      __gen_field(values->AlphaTestFunction, 24, 26) |
+      __gen_field(values->ColorDitherEnable, 23, 23) |
+      __gen_field(values->XDitherOffset, 21, 22) |
+      __gen_field(values->YDitherOffset, 19, 20) |
+      0;
+
+   /* Entries start at dw[1]; each entry i occupies dw[1 + 2*i .. 2 + 2*i]. */
+   for (uint32_t i = 0, j = 1; i < 8; i++, j += 2)
+      GEN9_BLEND_STATE_ENTRY_pack(data, &dw[j], &values->Entry[i]);
+}
+
+#define GEN9_CC_VIEWPORT_length 0x00000002
+
+/* Color-calculator viewport state: depth range used for depth clamping. */
+struct GEN9_CC_VIEWPORT {
+   float                                        MinimumDepth;
+   float                                        MaximumDepth;
+};
+
+/* Packs a GEN9_CC_VIEWPORT (2 dwords): min depth in dw[0], max in dw[1]. */
+static inline void
+GEN9_CC_VIEWPORT_pack(__gen_user_data *data, void * restrict dst,
+                      const struct GEN9_CC_VIEWPORT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_float(values->MinimumDepth) |
+      0;
+
+   dw[1] =
+      __gen_float(values->MaximumDepth) |
+      0;
+
+}
+
+#define GEN9_COLOR_CALC_STATE_length 0x00000006
+
+/* Color-calculator state: alpha-test reference value and the blend
+ * constant color.
+ */
+struct GEN9_COLOR_CALC_STATE {
+#define     Cancelled                                          0
+#define     NotCancelled                                       1
+   uint32_t                                     RoundDisableFunctionDisable;
+#define     ALPHATEST_UNORM8                                   0
+#define     ALPHATEST_FLOAT32                                  1
+   uint32_t                                     AlphaTestFormat;
+   /* Alpha reference value in two representations; AlphaTestFormat
+    * selects which one the hardware reads (see note in the pack fn).
+    */
+   uint32_t                                     AlphaReferenceValueAsUNORM8;
+   float                                        AlphaReferenceValueAsFLOAT32;
+   float                                        BlendConstantColorRed;
+   float                                        BlendConstantColorGreen;
+   float                                        BlendConstantColorBlue;
+   float                                        BlendConstantColorAlpha;
+};
+
+/* Packs a GEN9_COLOR_CALC_STATE (6 dwords). */
+static inline void
+GEN9_COLOR_CALC_STATE_pack(__gen_user_data *data, void * restrict dst,
+                           const struct GEN9_COLOR_CALC_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->RoundDisableFunctionDisable, 15, 15) |
+      __gen_field(values->AlphaTestFormat, 0, 0) |
+      0;
+
+   /* dw1 ORs both alpha-reference representations into the same dword;
+    * presumably the caller sets only the member matching AlphaTestFormat
+    * and leaves the other zero -- TODO confirm against the generator.
+    */
+   dw[1] =
+      __gen_field(values->AlphaReferenceValueAsUNORM8, 0, 31) |
+      __gen_float(values->AlphaReferenceValueAsFLOAT32) |
+      0;
+
+   dw[2] =
+      __gen_float(values->BlendConstantColorRed) |
+      0;
+
+   dw[3] =
+      __gen_float(values->BlendConstantColorGreen) |
+      0;
+
+   dw[4] =
+      __gen_float(values->BlendConstantColorBlue) |
+      0;
+
+   dw[5] =
+      __gen_float(values->BlendConstantColorAlpha) |
+      0;
+
+}
+
+#define GEN9_EXECUTION_UNIT_EXTENDED_MESSAGE_DESCRIPTOR_length 0x00000001
+
+/* Extended message descriptor for EU send instructions (1 dword). */
+struct GEN9_EXECUTION_UNIT_EXTENDED_MESSAGE_DESCRIPTOR {
+   uint32_t                                     ExtendedMessageLength;
+#define     NoTermination                                      0
+#define     EOT                                                1
+   uint32_t                                     EndOfThread;
+   uint32_t                                     TargetFunctionID;
+};
+
+/* Packs the descriptor: length in bits 9:6, EOT in bit 5, SFID in
+ * bits 3:0 (bit 4 has no field packed into it).
+ */
+static inline void
+GEN9_EXECUTION_UNIT_EXTENDED_MESSAGE_DESCRIPTOR_pack(__gen_user_data *data, void * restrict dst,
+                                                     const struct GEN9_EXECUTION_UNIT_EXTENDED_MESSAGE_DESCRIPTOR * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->ExtendedMessageLength, 6, 9) |
+      __gen_field(values->EndOfThread, 5, 5) |
+      __gen_field(values->TargetFunctionID, 0, 3) |
+      0;
+
+}
+
+#define GEN9_INTERFACE_DESCRIPTOR_DATA_length 0x00000008
+
+/* Compute-shader interface descriptor (8 dwords): kernel entry point,
+ * sampler/binding-table pointers, and GPGPU thread-group parameters.
+ */
+struct GEN9_INTERFACE_DESCRIPTOR_DATA {
+   uint32_t                                     KernelStartPointer;
+   uint32_t                                     KernelStartPointerHigh;
+#define     Ftz                                                0
+#define     SetByKernel                                        1
+   uint32_t                                     DenormMode;
+#define     Multiple                                           0
+#define     Single                                             1
+   uint32_t                                     SingleProgramFlow;
+#define     NormalPriority                                     0
+#define     HighPriority                                       1
+   uint32_t                                     ThreadPriority;
+#define     IEEE754                                            0
+#define     Alternate                                          1
+   uint32_t                                     FloatingPointMode;
+   bool                                         IllegalOpcodeExceptionEnable;
+   bool                                         MaskStackExceptionEnable;
+   bool                                         SoftwareExceptionEnable;
+   uint32_t                                     SamplerStatePointer;
+#define     Nosamplersused                                     0
+#define     Between1and4samplersused                           1
+#define     Between5and8samplersused                           2
+#define     Between9and12samplersused                          3
+#define     Between13and16samplersused                         4
+   uint32_t                                     SamplerCount;
+   uint32_t                                     BindingTablePointer;
+   uint32_t                                     BindingTableEntryCount;
+   uint32_t                                     ConstantIndirectURBEntryReadLength;
+   uint32_t                                     ConstantURBEntryReadOffset;
+#define     RTNE                                               0
+#define     RU                                                 1
+#define     RD                                                 2
+#define     RTZ                                                3
+   uint32_t                                     RoundingMode;
+   bool                                         BarrierEnable;
+#define     Encodes0K                                          0
+#define     Encodes1K                                          1
+#define     Encodes2K                                          2
+#define     Encodes4K                                          3
+#define     Encodes8K                                          4
+#define     Encodes16K                                         5
+#define     Encodes32K                                         6
+#define     Encodes64K                                         7
+   uint32_t                                     SharedLocalMemorySize;
+   bool                                         GlobalBarrierEnable;
+   uint32_t                                     NumberofThreadsinGPGPUThreadGroup;
+   uint32_t                                     CrossThreadConstantDataReadLength;
+};
+
+/* Packs a GEN9_INTERFACE_DESCRIPTOR_DATA (8 dwords).  Pointer-valued
+ * fields use __gen_offset (pre-aligned offsets) rather than __gen_field.
+ */
+static inline void
+GEN9_INTERFACE_DESCRIPTOR_DATA_pack(__gen_user_data *data, void * restrict dst,
+                                    const struct GEN9_INTERFACE_DESCRIPTOR_DATA * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->KernelStartPointer, 6, 31) |
+      0;
+
+   dw[1] =
+      __gen_offset(values->KernelStartPointerHigh, 0, 15) |
+      0;
+
+   dw[2] =
+      __gen_field(values->DenormMode, 19, 19) |
+      __gen_field(values->SingleProgramFlow, 18, 18) |
+      __gen_field(values->ThreadPriority, 17, 17) |
+      __gen_field(values->FloatingPointMode, 16, 16) |
+      __gen_field(values->IllegalOpcodeExceptionEnable, 13, 13) |
+      __gen_field(values->MaskStackExceptionEnable, 11, 11) |
+      __gen_field(values->SoftwareExceptionEnable, 7, 7) |
+      0;
+
+   dw[3] =
+      __gen_offset(values->SamplerStatePointer, 5, 31) |
+      __gen_field(values->SamplerCount, 2, 4) |
+      0;
+
+   dw[4] =
+      __gen_offset(values->BindingTablePointer, 5, 15) |
+      __gen_field(values->BindingTableEntryCount, 0, 4) |
+      0;
+
+   dw[5] =
+      __gen_field(values->ConstantIndirectURBEntryReadLength, 16, 31) |
+      __gen_field(values->ConstantURBEntryReadOffset, 0, 15) |
+      0;
+
+   dw[6] =
+      __gen_field(values->RoundingMode, 22, 23) |
+      __gen_field(values->BarrierEnable, 21, 21) |
+      __gen_field(values->SharedLocalMemorySize, 16, 20) |
+      __gen_field(values->GlobalBarrierEnable, 15, 15) |
+      __gen_field(values->NumberofThreadsinGPGPUThreadGroup, 0, 9) |
+      0;
+
+   dw[7] =
+      __gen_field(values->CrossThreadConstantDataReadLength, 0, 7) |
+      0;
+
+}
+
+#define GEN9_ROUNDINGPRECISIONTABLE_3_BITS_length 0x00000001
+
+/* 3-bit rounding-precision table entry.  The _116.._816 encodings
+ * presumably denote fractions 1/16 through 8/16 -- confirm against the
+ * Gen9 PRM before relying on the interpretation.
+ */
+struct GEN9_ROUNDINGPRECISIONTABLE_3_BITS {
+#define     _116                                               0
+#define     _216                                               1
+#define     _316                                               2
+#define     _416                                               3
+#define     _516                                               4
+#define     _616                                               5
+#define     _716                                               6
+#define     _816                                               7
+   uint32_t                                     RoundingPrecision;
+};
+
+/* Packs the 3-bit rounding precision into bits 2:0 of one dword. */
+static inline void
+GEN9_ROUNDINGPRECISIONTABLE_3_BITS_pack(__gen_user_data *data, void * restrict dst,
+                                        const struct GEN9_ROUNDINGPRECISIONTABLE_3_BITS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->RoundingPrecision, 0, 2) |
+      0;
+
+}
+
+#define GEN9_BINDING_TABLE_STATE_length 0x00000001
+
+/* One binding-table entry: a 64-byte-aligned surface-state offset. */
+struct GEN9_BINDING_TABLE_STATE {
+   uint32_t                                     SurfaceStatePointer;
+};
+
+/* Packs the surface-state pointer into bits 31:6 of one dword. */
+static inline void
+GEN9_BINDING_TABLE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                              const struct GEN9_BINDING_TABLE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_offset(values->SurfaceStatePointer, 6, 31) |
+      0;
+
+}
+
+#define GEN9_RENDER_SURFACE_STATE_length 0x00000010
+
+/* RENDER_SURFACE_STATE (16 dwords): describes a texture/render-target/
+ * buffer surface -- type, format, tiling, dimensions, aux surface, and
+ * clear color.  GEN9_MEMORY_OBJECT_CONTROL_STATE is defined elsewhere
+ * in this file.
+ */
+struct GEN9_RENDER_SURFACE_STATE {
+#define     SURFTYPE_1D                                        0
+#define     SURFTYPE_2D                                        1
+#define     SURFTYPE_3D                                        2
+#define     SURFTYPE_CUBE                                      3
+#define     SURFTYPE_BUFFER                                    4
+#define     SURFTYPE_STRBUF                                    5
+#define     SURFTYPE_NULL                                      7
+   uint32_t                                     SurfaceType;
+   bool                                         SurfaceArray;
+   bool                                         ASTC_Enable;
+   uint32_t                                     SurfaceFormat;
+#define     VALIGN4                                            1
+#define     VALIGN8                                            2
+#define     VALIGN16                                           3
+   uint32_t                                     SurfaceVerticalAlignment;
+#define     HALIGN4                                            1
+#define     HALIGN8                                            2
+#define     HALIGN16                                           3
+   uint32_t                                     SurfaceHorizontalAlignment;
+#define     LINEAR                                             0
+#define     WMAJOR                                             1
+#define     XMAJOR                                             2
+#define     YMAJOR                                             3
+   uint32_t                                     TileMode;
+   uint32_t                                     VerticalLineStride;
+   uint32_t                                     VerticalLineStrideOffset;
+   bool                                         SamplerL2BypassModeDisable;
+#define     WriteOnlyCache                                     0
+#define     ReadWriteCache                                     1
+   uint32_t                                     RenderCacheReadWriteMode;
+#define     NORMAL_MODE                                        0
+#define     PROGRESSIVE_FRAME                                  2
+#define     INTERLACED_FRAME                                   3
+   uint32_t                                     MediaBoundaryPixelMode;
+   bool                                         CubeFaceEnablePositiveZ;
+   bool                                         CubeFaceEnableNegativeZ;
+   bool                                         CubeFaceEnablePositiveY;
+   bool                                         CubeFaceEnableNegativeY;
+   bool                                         CubeFaceEnablePositiveX;
+   bool                                         CubeFaceEnableNegativeX;
+   struct GEN9_MEMORY_OBJECT_CONTROL_STATE      MemoryObjectControlState;
+   float                                        BaseMipLevel;
+   uint32_t                                     SurfaceQPitch;
+   uint32_t                                     Height;
+   uint32_t                                     Width;
+   uint32_t                                     Depth;
+   uint32_t                                     SurfacePitch;
+#define     _0DEG                                              0
+#define     _90DEG                                             1
+#define     _180DEG                                            2
+#define     _270DEG                                            3
+   uint32_t                                     RenderTargetAndSampleUnormRotation;
+   uint32_t                                     MinimumArrayElement;
+   uint32_t                                     RenderTargetViewExtent;
+#define     MSS                                                0
+#define     DEPTH_STENCIL                                      1
+   uint32_t                                     MultisampledSurfaceStorageFormat;
+#define     MULTISAMPLECOUNT_1                                 0
+#define     MULTISAMPLECOUNT_2                                 1
+#define     MULTISAMPLECOUNT_4                                 2
+#define     MULTISAMPLECOUNT_8                                 3
+#define     MULTISAMPLECOUNT_16                                4
+   uint32_t                                     NumberofMultisamples;
+   uint32_t                                     MultisamplePositionPaletteIndex;
+   uint32_t                                     XOffset;
+   uint32_t                                     YOffset;
+   bool                                         EWADisableForCube;
+   /* NOTE(review): two encoding sets share this field's defines
+    * (NONE/_4KB/_64KB vs TILEYF/TILEYS with the same values); they
+    * presumably name the same 2-bit encodings in different contexts --
+    * confirm against the genxml definition.
+    */
+#define     NONE                                               0
+#define     _4KB                                               1
+#define     _64KB                                              2
+#define     TILEYF                                             1
+#define     TILEYS                                             2
+   uint32_t                                     TiledResourceMode;
+#define     GPUcoherent                                        0
+#define     IAcoherent                                         1
+   uint32_t                                     CoherencyType;
+   uint32_t                                     MipTailStartLOD;
+   uint32_t                                     SurfaceMinLOD;
+   uint32_t                                     MIPCountLOD;
+   uint32_t                                     AuxiliarySurfaceQPitch;
+   uint32_t                                     AuxiliarySurfacePitch;
+#define     AUX_NONE                                           0
+#define     AUX_CCS_D                                          1
+#define     AUX_APPEND                                         2
+#define     AUX_HIZ                                            3
+#define     AUX_CCS_E                                          5
+   uint32_t                                     AuxiliarySurfaceMode;
+   bool                                         SeparateUVPlaneEnable;
+   uint32_t                                     XOffsetforUorUVPlane;
+   uint32_t                                     YOffsetforUorUVPlane;
+#define     Horizontal                                         0
+#define     Vertical                                           1
+   uint32_t                                     MemoryCompressionMode;
+   bool                                         MemoryCompressionEnable;
+   uint32_t                                     ShaderChannelSelectRed;
+   uint32_t                                     ShaderChannelSelectGreen;
+   uint32_t                                     ShaderChannelSelectBlue;
+   uint32_t                                     ShaderChannelSelectAlpha;
+   float                                        ResourceMinLOD;
+   __gen_address_type                           SurfaceBaseAddress;
+   uint32_t                                     XOffsetforVPlane;
+   uint32_t                                     YOffsetforVPlane;
+   uint32_t                                     AuxiliaryTableIndexforMediaCompressedSurface;
+   __gen_address_type                           AuxiliarySurfaceBaseAddress;
+   uint32_t                                     QuiltHeight;
+   uint32_t                                     QuiltWidth;
+   float                                        HierarchicalDepthClearValue;
+   uint32_t                                     RedClearColor;
+   uint32_t                                     GreenClearColor;
+   uint32_t                                     BlueClearColor;
+   uint32_t                                     AlphaClearColor;
+};
+
+/* Packs a GEN9_RENDER_SURFACE_STATE (16 dwords).  Address fields are
+ * emitted through __gen_combine_address so the caller's relocation
+ * machinery (via "data") can record them.
+ */
+static inline void
+GEN9_RENDER_SURFACE_STATE_pack(__gen_user_data *data, void * restrict dst,
+                               const struct GEN9_RENDER_SURFACE_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->SurfaceType, 29, 31) |
+      __gen_field(values->SurfaceArray, 28, 28) |
+      __gen_field(values->ASTC_Enable, 27, 27) |
+      __gen_field(values->SurfaceFormat, 18, 26) |
+      __gen_field(values->SurfaceVerticalAlignment, 16, 17) |
+      __gen_field(values->SurfaceHorizontalAlignment, 14, 15) |
+      __gen_field(values->TileMode, 12, 13) |
+      __gen_field(values->VerticalLineStride, 11, 11) |
+      __gen_field(values->VerticalLineStrideOffset, 10, 10) |
+      __gen_field(values->SamplerL2BypassModeDisable, 9, 9) |
+      __gen_field(values->RenderCacheReadWriteMode, 8, 8) |
+      __gen_field(values->MediaBoundaryPixelMode, 6, 7) |
+      __gen_field(values->CubeFaceEnablePositiveZ, 0, 0) |
+      __gen_field(values->CubeFaceEnableNegativeZ, 1, 1) |
+      __gen_field(values->CubeFaceEnablePositiveY, 2, 2) |
+      __gen_field(values->CubeFaceEnableNegativeY, 3, 3) |
+      __gen_field(values->CubeFaceEnablePositiveX, 4, 4) |
+      __gen_field(values->CubeFaceEnableNegativeX, 5, 5) |
+      0;
+
+   /* MOCS is a sub-structure packed into its own dword first, then
+    * merged into dw1 bits 30:24.  BaseMipLevel is fixed-point with one
+    * fractional bit (hence the * (1 << 1)).
+    */
+   uint32_t dw_MemoryObjectControlState;
+   GEN9_MEMORY_OBJECT_CONTROL_STATE_pack(data, &dw_MemoryObjectControlState, &values->MemoryObjectControlState);
+   dw[1] =
+      __gen_field(dw_MemoryObjectControlState, 24, 30) |
+      __gen_field(values->BaseMipLevel * (1 << 1), 19, 23) |
+      __gen_field(values->SurfaceQPitch, 0, 14) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Height, 16, 29) |
+      __gen_field(values->Width, 0, 13) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Depth, 21, 31) |
+      __gen_field(values->SurfacePitch, 0, 17) |
+      0;
+
+   dw[4] =
+      __gen_field(values->RenderTargetAndSampleUnormRotation, 29, 30) |
+      __gen_field(values->MinimumArrayElement, 18, 28) |
+      __gen_field(values->RenderTargetViewExtent, 7, 17) |
+      __gen_field(values->MultisampledSurfaceStorageFormat, 6, 6) |
+      __gen_field(values->NumberofMultisamples, 3, 5) |
+      __gen_field(values->MultisamplePositionPaletteIndex, 0, 2) |
+      0;
+
+   dw[5] =
+      __gen_offset(values->XOffset, 25, 31) |
+      __gen_offset(values->YOffset, 21, 23) |
+      __gen_field(values->EWADisableForCube, 20, 20) |
+      __gen_field(values->TiledResourceMode, 18, 19) |
+      __gen_field(values->CoherencyType, 14, 14) |
+      __gen_field(values->MipTailStartLOD, 8, 11) |
+      __gen_field(values->SurfaceMinLOD, 4, 7) |
+      __gen_field(values->MIPCountLOD, 0, 3) |
+      0;
+
+   /* NOTE(review): AuxiliarySurfaceQPitch (16..30) overlaps
+    * XOffsetforUorUVPlane (16..29), and SeparateUVPlaneEnable (31)
+    * overlaps the QPitch range end -- these look like planar-video vs.
+    * aux-surface alternatives of the same dword; caller presumably sets
+    * only one group.  Verify against the genxml source.
+    */
+   dw[6] =
+      __gen_field(values->AuxiliarySurfaceQPitch, 16, 30) |
+      __gen_field(values->AuxiliarySurfacePitch, 3, 11) |
+      __gen_field(values->AuxiliarySurfaceMode, 0, 2) |
+      __gen_field(values->SeparateUVPlaneEnable, 31, 31) |
+      __gen_field(values->XOffsetforUorUVPlane, 16, 29) |
+      __gen_field(values->YOffsetforUorUVPlane, 0, 13) |
+      0;
+
+   /* ResourceMinLOD is fixed-point U4.8 style (8 fractional bits). */
+   dw[7] =
+      __gen_field(values->MemoryCompressionMode, 31, 31) |
+      __gen_field(values->MemoryCompressionEnable, 30, 30) |
+      __gen_field(values->ShaderChannelSelectRed, 25, 27) |
+      __gen_field(values->ShaderChannelSelectGreen, 22, 24) |
+      __gen_field(values->ShaderChannelSelectBlue, 19, 21) |
+      __gen_field(values->ShaderChannelSelectAlpha, 16, 18) |
+      __gen_field(values->ResourceMinLOD * (1 << 8), 0, 11) |
+      0;
+
+   /* dw8..dw9: 64-bit surface base address (relocated via data). */
+   uint32_t dw8 =
+      0;
+
+   uint64_t qw8 =
+      __gen_combine_address(data, &dw[8], values->SurfaceBaseAddress, dw8);
+
+   dw[8] = qw8;
+   dw[9] = qw8 >> 32;
+
+   /* dw10..dw11: 64-bit aux surface base address.  NOTE(review): the
+    * XOffsetforVPlane/YOffsetforVPlane terms use bit positions 48..61
+    * and 32..45 but are accumulated into a 32-bit dw10, so they appear
+    * to be truncated away before reaching qw10 -- verify against the
+    * pack-header generator whether this is intended (fields unused) or
+    * a latent generator bug.
+    */
+   uint32_t dw10 =
+      __gen_field(values->XOffsetforVPlane, 48, 61) |
+      __gen_field(values->YOffsetforVPlane, 32, 45) |
+      __gen_field(values->AuxiliaryTableIndexforMediaCompressedSurface, 21, 31) |
+      __gen_field(values->QuiltHeight, 5, 9) |
+      __gen_field(values->QuiltWidth, 0, 4) |
+      0;
+
+   uint64_t qw10 =
+      __gen_combine_address(data, &dw[10], values->AuxiliarySurfaceBaseAddress, dw10);
+
+   dw[10] = qw10;
+   dw[11] = qw10 >> 32;
+
+   /* dw12 ORs the HiZ clear value and RedClearColor into one dword;
+    * presumably only one is meaningful depending on surface use --
+    * caller leaves the other zero.
+    */
+   dw[12] =
+      __gen_float(values->HierarchicalDepthClearValue) |
+      __gen_field(values->RedClearColor, 0, 31) |
+      0;
+
+   dw[13] =
+      __gen_field(values->GreenClearColor, 0, 31) |
+      0;
+
+   dw[14] =
+      __gen_field(values->BlueClearColor, 0, 31) |
+      0;
+
+   dw[15] =
+      __gen_field(values->AlphaClearColor, 0, 31) |
+      0;
+
+}
+
+#define GEN9_FILTER_COEFFICIENT_length 0x00000001
+
+/* One 8-bit filter coefficient (e.g. for sampler filter tables). */
+struct GEN9_FILTER_COEFFICIENT {
+   uint32_t                                     FilterCoefficient;
+};
+
+/* Packs the coefficient into bits 7:0 of one dword. */
+static inline void
+GEN9_FILTER_COEFFICIENT_pack(__gen_user_data *data, void * restrict dst,
+                             const struct GEN9_FILTER_COEFFICIENT * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;
+
+   dw[0] =
+      __gen_field(values->FilterCoefficient, 0, 7) |
+      0;
+
+}
+
+#define GEN9_SAMPLER_STATE_length 0x00000004
+
+/* SAMPLER_STATE (4 dwords): texture filtering, LOD clamping, address
+ * control, chroma key, and border-color settings.  (The corresponding
+ * pack function follows this struct.)
+ */
+struct GEN9_SAMPLER_STATE {
+   bool                                         SamplerDisable;
+#define     DX10OGL                                            0
+#define     DX9                                                1
+   uint32_t                                     TextureBorderColorMode;
+#define     CLAMP_NONE                                         0
+#define     CLAMP_OGL                                          2
+   uint32_t                                     LODPreClampMode;
+   uint32_t                                     CoarseLODQualityMode;
+#define     MIPFILTER_NONE                                     0
+#define     MIPFILTER_NEAREST                                  1
+#define     MIPFILTER_LINEAR                                   3
+   uint32_t                                     MipModeFilter;
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MagModeFilter;
+   /* The MAPFILTER_* macros below are identical redefinitions of the
+    * ones above (legal in C); the generator emits the value set once
+    * per field that uses it.
+    */
+#define     MAPFILTER_NEAREST                                  0
+#define     MAPFILTER_LINEAR                                   1
+#define     MAPFILTER_ANISOTROPIC                              2
+#define     MAPFILTER_MONO                                     6
+   uint32_t                                     MinModeFilter;
+   float                                        TextureLODBias;
+#define     LEGACY                                             0
+#define     EWAApproximation                                   1
+   uint32_t                                     AnisotropicAlgorithm;
+   float                                        MinLOD;
+   float                                        MaxLOD;
+   bool                                         ChromaKeyEnable;
+   uint32_t                                     ChromaKeyIndex;
+#define     KEYFILTER_KILL_ON_ANY_MATCH                        0
+#define     KEYFILTER_REPLACE_BLACK                            1
+   uint32_t                                     ChromaKeyMode;
+#define     PREFILTEROPALWAYS                                  0
+#define     PREFILTEROPNEVER                                   1
+#define     PREFILTEROPLESS                                    2
+#define     PREFILTEROPEQUAL                                   3
+#define     PREFILTEROPLEQUAL                                  4
+#define     PREFILTEROPGREATER                                 5
+#define     PREFILTEROPNOTEQUAL                                6
+#define     PREFILTEROPGEQUAL                                  7
+   uint32_t                                     ShadowFunction;
+#define     PROGRAMMED                                         0
+#define     OVERRIDE                                           1
+   uint32_t                                     CubeSurfaceControlMode;
+   uint32_t                                     IndirectStatePointer;
+#define     MIPNONE                                            0
+#define     MIPFILTER                                          1
+   uint32_t                                     LODClampMagnificationMode;
+#define     STD_FILTER                                         0
+#define     COMPARISON                                         1
+#define     MINIMUM                                            2
+#define     MAXIMUM                                            3
+   uint32_t                                     ReductionType;
+#define     RATIO21                                            0
+#define     RATIO41                                            1
+#define     RATIO61                                            2
+#define     RATIO81                                            3
+#define     RATIO101                                           4
+#define     RATIO121                                           5
+#define     RATIO141                                           6
+#define     RATIO161                                           7
+   uint32_t                                     MaximumAnisotropy;
+   bool                                         RAddressMinFilterRoundingEnable;
+   bool                                         RAddressMagFilterRoundingEnable;
+   bool                                         VAddressMinFilterRoundingEnable;
+   bool                                         VAddressMagFilterRoundingEnable;
+   bool                                         UAddressMinFilterRoundingEnable;
+   bool                                         UAddressMagFilterRoundingEnable;
+#define     FULL                                               0
+#define     HIGH                                               1
+#define     MED                                                2
+#define     LOW                                                3
+   uint32_t                                     TrilinearFilterQuality;
+   bool                                         NonnormalizedCoordinateEnable;
+   bool                                         ReductionTypeEnable;
+   uint32_t                                     TCXAddressControlMode;
+   uint32_t                                     TCYAddressControlMode;
+   uint32_t                                     TCZAddressControlMode;
+};
+
+static inline void
+GEN9_SAMPLER_STATE_pack(__gen_user_data *data, void * restrict dst,
+                        const struct GEN9_SAMPLER_STATE * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;   /* serializes *values into the 4-dword SAMPLER_STATE layout; 'data' is unused by this pack */
+
+   dw[0] =   /* sampler enable, LOD pre-clamp, mip/mag/min filter selection, LOD bias */
+      __gen_field(values->SamplerDisable, 31, 31) |
+      __gen_field(values->TextureBorderColorMode, 29, 29) |
+      __gen_field(values->LODPreClampMode, 27, 28) |
+      __gen_field(values->CoarseLODQualityMode, 22, 26) |
+      __gen_field(values->MipModeFilter, 20, 21) |
+      __gen_field(values->MagModeFilter, 17, 19) |
+      __gen_field(values->MinModeFilter, 14, 16) |
+      __gen_fixed(values->TextureLODBias, 1, 13, true, 8) |   /* signed fixed point, 8 fractional bits */
+      __gen_field(values->AnisotropicAlgorithm, 0, 0) |
+      0;
+
+   dw[1] =   /* LOD clamp range plus chroma-key and shadow-compare controls */
+      __gen_field(values->MinLOD * (1 << 8), 20, 31) |   /* float LODs converted to fixed point with 8 fractional bits */
+      __gen_field(values->MaxLOD * (1 << 8), 8, 19) |
+      __gen_field(values->ChromaKeyEnable, 7, 7) |
+      __gen_field(values->ChromaKeyIndex, 5, 6) |
+      __gen_field(values->ChromaKeyMode, 4, 4) |
+      __gen_field(values->ShadowFunction, 1, 3) |
+      __gen_field(values->CubeSurfaceControlMode, 0, 0) |
+      0;
+
+   dw[2] =
+      __gen_field(values->IndirectStatePointer, 6, 23) |   /* pointer value is shifted into bits 6..23 of the dword */
+      __gen_field(values->LODClampMagnificationMode, 0, 0) |
+      0;
+
+   dw[3] =   /* anisotropy limit, per-axis rounding enables (bits 13..18) and TCX/TCY/TCZ address modes */
+      __gen_field(values->ReductionType, 22, 23) |
+      __gen_field(values->MaximumAnisotropy, 19, 21) |
+      __gen_field(values->RAddressMinFilterRoundingEnable, 13, 13) |
+      __gen_field(values->RAddressMagFilterRoundingEnable, 14, 14) |
+      __gen_field(values->VAddressMinFilterRoundingEnable, 15, 15) |
+      __gen_field(values->VAddressMagFilterRoundingEnable, 16, 16) |
+      __gen_field(values->UAddressMinFilterRoundingEnable, 17, 17) |
+      __gen_field(values->UAddressMagFilterRoundingEnable, 18, 18) |
+      __gen_field(values->TrilinearFilterQuality, 11, 12) |
+      __gen_field(values->NonnormalizedCoordinateEnable, 10, 10) |
+      __gen_field(values->ReductionTypeEnable, 9, 9) |
+      __gen_field(values->TCXAddressControlMode, 6, 8) |
+      __gen_field(values->TCYAddressControlMode, 3, 5) |
+      __gen_field(values->TCZAddressControlMode, 0, 2) |
+      0;
+
+}
+
+#define GEN9_SAMPLER_STATE_8X8_AVS_COEFFICIENTS_length 0x00000008   /* 8 dwords */
+
+struct GEN9_SAMPLER_STATE_8X8_AVS_COEFFICIENTS {   /* AVS 8x8 scaler filter coefficients; packed 8 bits apiece by the companion _pack() */
+   uint32_t                                     Table0YFilterCoefficientn1;
+   uint32_t                                     Table0XFilterCoefficientn1;
+   uint32_t                                     Table0YFilterCoefficientn0;
+   uint32_t                                     Table0XFilterCoefficientn0;
+   uint32_t                                     Table0YFilterCoefficientn3;
+   uint32_t                                     Table0XFilterCoefficientn3;
+   uint32_t                                     Table0YFilterCoefficientn2;
+   uint32_t                                     Table0XFilterCoefficientn2;
+   uint32_t                                     Table0YFilterCoefficientn5;
+   uint32_t                                     Table0XFilterCoefficientn5;
+   uint32_t                                     Table0YFilterCoefficientn4;
+   uint32_t                                     Table0XFilterCoefficientn4;
+   uint32_t                                     Table0YFilterCoefficientn7;
+   uint32_t                                     Table0XFilterCoefficientn7;
+   uint32_t                                     Table0YFilterCoefficientn6;
+   uint32_t                                     Table0XFilterCoefficientn6;
+   uint32_t                                     Table1XFilterCoefficientn3;   /* Table 1 carries only coefficients n2..n5 in this structure */
+   uint32_t                                     Table1XFilterCoefficientn2;
+   uint32_t                                     Table1XFilterCoefficientn5;
+   uint32_t                                     Table1XFilterCoefficientn4;
+   uint32_t                                     Table1YFilterCoefficientn3;
+   uint32_t                                     Table1YFilterCoefficientn2;
+   uint32_t                                     Table1YFilterCoefficientn5;
+   uint32_t                                     Table1YFilterCoefficientn4;
+};
+
+static inline void
+GEN9_SAMPLER_STATE_8X8_AVS_COEFFICIENTS_pack(__gen_user_data *data, void * restrict dst,
+                                             const struct GEN9_SAMPLER_STATE_8X8_AVS_COEFFICIENTS * restrict values)
+{
+   uint32_t *dw = (uint32_t * restrict) dst;   /* four 8-bit coefficients per dword; 'data' is unused by this pack */
+
+   dw[0] =   /* Table 0: dwords 0..3 each hold Y/X coefficient pairs for two taps */
+      __gen_field(values->Table0YFilterCoefficientn1, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn1, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn0, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn0, 0, 7) |
+      0;
+
+   dw[1] =
+      __gen_field(values->Table0YFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn3, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn2, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn2, 0, 7) |
+      0;
+
+   dw[2] =
+      __gen_field(values->Table0YFilterCoefficientn5, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn5, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn4, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn4, 0, 7) |
+      0;
+
+   dw[3] =
+      __gen_field(values->Table0YFilterCoefficientn7, 24, 31) |
+      __gen_field(values->Table0XFilterCoefficientn7, 16, 23) |
+      __gen_field(values->Table0YFilterCoefficientn6, 8, 15) |
+      __gen_field(values->Table0XFilterCoefficientn6, 0, 7) |
+      0;
+
+   dw[4] =   /* NOTE(review): Table1 X n3/n2 occupy only the high half of dw[4] and n5/n4 only the low half of dw[5]; n0/n1/n6/n7 are absent -- confirm against the PRM */
+      __gen_field(values->Table1XFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table1XFilterCoefficientn2, 16, 23) |
+      0;
+
+   dw[5] =
+      __gen_field(values->Table1XFilterCoefficientn5, 8, 15) |
+      __gen_field(values->Table1XFilterCoefficientn4, 0, 7) |
+      0;
+
+   dw[6] =   /* Table 1 Y coefficients mirror the X layout of dw[4]/dw[5] */
+      __gen_field(values->Table1YFilterCoefficientn3, 24, 31) |
+      __gen_field(values->Table1YFilterCoefficientn2, 16, 23) |
+      0;
+
+   dw[7] =
+      __gen_field(values->Table1YFilterCoefficientn5, 8, 15) |
+      __gen_field(values->Table1YFilterCoefficientn4, 0, 7) |
+      0;
+
+}
+
+/* Enum 3D_Prim_Topo_Type (codes for _3DPRIM_PATCHLIST_n follow the pattern n + 31) */
+#define     _3DPRIM_POINTLIST                                  1
+#define     _3DPRIM_LINELIST                                   2
+#define     _3DPRIM_LINESTRIP                                  3
+#define     _3DPRIM_TRILIST                                    4
+#define     _3DPRIM_TRISTRIP                                   5
+#define     _3DPRIM_TRIFAN                                     6
+#define     _3DPRIM_QUADLIST                                   7
+#define     _3DPRIM_QUADSTRIP                                  8
+#define     _3DPRIM_LINELIST_ADJ                               9
+#define     _3DPRIM_LINESTRIP_ADJ                             10
+#define     _3DPRIM_TRILIST_ADJ                               11
+#define     _3DPRIM_TRISTRIP_ADJ                              12
+#define     _3DPRIM_TRISTRIP_REVERSE                          13
+#define     _3DPRIM_POLYGON                                   14
+#define     _3DPRIM_RECTLIST                                  15
+#define     _3DPRIM_LINELOOP                                  16
+#define     _3DPRIM_POINTLIST_BF                              17
+#define     _3DPRIM_LINESTRIP_CONT                            18
+#define     _3DPRIM_LINESTRIP_BF                              19
+#define     _3DPRIM_LINESTRIP_CONT_BF                         20
+#define     _3DPRIM_TRIFAN_NOSTIPPLE                          22
+#define     _3DPRIM_PATCHLIST_1                               32
+#define     _3DPRIM_PATCHLIST_2                               33
+#define     _3DPRIM_PATCHLIST_3                               34
+#define     _3DPRIM_PATCHLIST_4                               35
+#define     _3DPRIM_PATCHLIST_5                               36
+#define     _3DPRIM_PATCHLIST_6                               37
+#define     _3DPRIM_PATCHLIST_7                               38
+#define     _3DPRIM_PATCHLIST_8                               39
+#define     _3DPRIM_PATCHLIST_9                               40
+#define     _3DPRIM_PATCHLIST_10                              41
+#define     _3DPRIM_PATCHLIST_11                              42
+#define     _3DPRIM_PATCHLIST_12                              43
+#define     _3DPRIM_PATCHLIST_13                              44
+#define     _3DPRIM_PATCHLIST_14                              45
+#define     _3DPRIM_PATCHLIST_15                              46
+#define     _3DPRIM_PATCHLIST_16                              47
+#define     _3DPRIM_PATCHLIST_17                              48
+#define     _3DPRIM_PATCHLIST_18                              49
+#define     _3DPRIM_PATCHLIST_19                              50
+#define     _3DPRIM_PATCHLIST_20                              51
+#define     _3DPRIM_PATCHLIST_21                              52
+#define     _3DPRIM_PATCHLIST_22                              53
+#define     _3DPRIM_PATCHLIST_23                              54
+#define     _3DPRIM_PATCHLIST_24                              55
+#define     _3DPRIM_PATCHLIST_25                              56
+#define     _3DPRIM_PATCHLIST_26                              57
+#define     _3DPRIM_PATCHLIST_27                              58
+#define     _3DPRIM_PATCHLIST_28                              59
+#define     _3DPRIM_PATCHLIST_29                              60
+#define     _3DPRIM_PATCHLIST_30                              61
+#define     _3DPRIM_PATCHLIST_31                              62
+#define     _3DPRIM_PATCHLIST_32                              63
+
+/* Enum 3D_Vertex_Component_Control (codes 5 and 6 are not defined here) */
+#define     VFCOMP_NOSTORE                                     0
+#define     VFCOMP_STORE_SRC                                   1
+#define     VFCOMP_STORE_0                                     2
+#define     VFCOMP_STORE_1_FP                                  3
+#define     VFCOMP_STORE_1_INT                                 4
+#define     VFCOMP_STORE_PID                                   7
+
+/* Enum COMPONENT_ENABLES (bitmask: X=1, Y=2, Z=4, W=8) */
+#define     CE_NONE                                            0
+#define     CE_X                                               1
+#define     CE_Y                                               2
+#define     CE_XY                                              3
+#define     CE_Z                                               4
+#define     CE_XZ                                              5
+#define     CE_YZ                                              6
+#define     CE_XYZ                                             7
+#define     CE_W                                               8
+#define     CE_XW                                              9
+#define     CE_YW                                             10
+#define     CE_XYW                                            11
+#define     CE_ZW                                             12
+#define     CE_XZW                                            13
+#define     CE_YZW                                            14
+#define     CE_XYZW                                           15
+
+/* Enum Attribute_Component_Format */
+#define     ACF_DISABLED                                       0
+#define     ACF_XY                                             1
+#define     ACF_XYZ                                            2
+#define     ACF_XYZW                                           3
+
+/* Enum WRAP_SHORTEST_ENABLE (bitmask: X=1, Y=2, Z=4, W=8) */
+#define     WSE_X                                              1
+#define     WSE_Y                                              2
+#define     WSE_XY                                             3
+#define     WSE_Z                                              4
+#define     WSE_XZ                                             5
+#define     WSE_YZ                                             6
+#define     WSE_XYZ                                            7
+#define     WSE_W                                              8
+#define     WSE_XW                                             9
+#define     WSE_YW                                            10
+#define     WSE_XYW                                           11
+#define     WSE_ZW                                            12
+#define     WSE_XZW                                           13
+#define     WSE_YZW                                           14
+#define     WSE_XYZW                                          15
+
+/* Enum 3D_Stencil_Operation */
+#define     STENCILOP_KEEP                                     0
+#define     STENCILOP_ZERO                                     1
+#define     STENCILOP_REPLACE                                  2
+#define     STENCILOP_INCRSAT                                  3
+#define     STENCILOP_DECRSAT                                  4
+#define     STENCILOP_INCR                                     5
+#define     STENCILOP_DECR                                     6
+#define     STENCILOP_INVERT                                   7
+
+/* Enum 3D_Color_Buffer_Blend_Factor (inverted variants equal the corresponding base factor + 16) */
+#define     BLENDFACTOR_ONE                                    1
+#define     BLENDFACTOR_SRC_COLOR                              2
+#define     BLENDFACTOR_SRC_ALPHA                              3
+#define     BLENDFACTOR_DST_ALPHA                              4
+#define     BLENDFACTOR_DST_COLOR                              5
+#define     BLENDFACTOR_SRC_ALPHA_SATURATE                     6
+#define     BLENDFACTOR_CONST_COLOR                            7
+#define     BLENDFACTOR_CONST_ALPHA                            8
+#define     BLENDFACTOR_SRC1_COLOR                             9
+#define     BLENDFACTOR_SRC1_ALPHA                            10
+#define     BLENDFACTOR_ZERO                                  17
+#define     BLENDFACTOR_INV_SRC_COLOR                         18
+#define     BLENDFACTOR_INV_SRC_ALPHA                         19
+#define     BLENDFACTOR_INV_DST_ALPHA                         20
+#define     BLENDFACTOR_INV_DST_COLOR                         21
+#define     BLENDFACTOR_INV_CONST_COLOR                       23
+#define     BLENDFACTOR_INV_CONST_ALPHA                       24
+#define     BLENDFACTOR_INV_SRC1_COLOR                        25
+#define     BLENDFACTOR_INV_SRC1_ALPHA                        26
+
+/* Enum 3D_Color_Buffer_Blend_Function */
+#define     BLENDFUNCTION_ADD                                  0
+#define     BLENDFUNCTION_SUBTRACT                             1
+#define     BLENDFUNCTION_REVERSE_SUBTRACT                     2
+#define     BLENDFUNCTION_MIN                                  3
+#define     BLENDFUNCTION_MAX                                  4
+
+/* Enum 3D_Compare_Function */
+#define     COMPAREFUNCTION_ALWAYS                             0
+#define     COMPAREFUNCTION_NEVER                              1
+#define     COMPAREFUNCTION_LESS                               2
+#define     COMPAREFUNCTION_EQUAL                              3
+#define     COMPAREFUNCTION_LEQUAL                             4
+#define     COMPAREFUNCTION_GREATER                            5
+#define     COMPAREFUNCTION_NOTEQUAL                           6
+#define     COMPAREFUNCTION_GEQUAL                             7
+
+/* Enum 3D_Logic_Op_Function (all 16 values of the 4-bit op field) */
+#define     LOGICOP_CLEAR                                      0
+#define     LOGICOP_NOR                                        1
+#define     LOGICOP_AND_INVERTED                               2
+#define     LOGICOP_COPY_INVERTED                              3
+#define     LOGICOP_AND_REVERSE                                4
+#define     LOGICOP_INVERT                                     5
+#define     LOGICOP_XOR                                        6
+#define     LOGICOP_NAND                                       7
+#define     LOGICOP_AND                                        8
+#define     LOGICOP_EQUIV                                      9
+#define     LOGICOP_NOOP                                      10
+#define     LOGICOP_OR_INVERTED                               11
+#define     LOGICOP_COPY                                      12
+#define     LOGICOP_OR_REVERSE                                13
+#define     LOGICOP_OR                                        14
+#define     LOGICOP_SET                                       15
+
+/* Enum SURFACE_FORMAT (numbering is sparse; gaps are values not listed for this table) */
+#define     R32G32B32A32_FLOAT                                 0
+#define     R32G32B32A32_SINT                                  1
+#define     R32G32B32A32_UINT                                  2
+#define     R32G32B32A32_UNORM                                 3
+#define     R32G32B32A32_SNORM                                 4
+#define     R64G64_FLOAT                                       5
+#define     R32G32B32X32_FLOAT                                 6
+#define     R32G32B32A32_SSCALED                               7
+#define     R32G32B32A32_USCALED                               8
+#define     R32G32B32A32_SFIXED                               32
+#define     R64G64_PASSTHRU                                   33
+#define     R32G32B32_FLOAT                                   64
+#define     R32G32B32_SINT                                    65
+#define     R32G32B32_UINT                                    66
+#define     R32G32B32_UNORM                                   67
+#define     R32G32B32_SNORM                                   68
+#define     R32G32B32_SSCALED                                 69
+#define     R32G32B32_USCALED                                 70
+#define     R32G32B32_SFIXED                                  80
+#define     R16G16B16A16_UNORM                               128
+#define     R16G16B16A16_SNORM                               129
+#define     R16G16B16A16_SINT                                130
+#define     R16G16B16A16_UINT                                131
+#define     R16G16B16A16_FLOAT                               132
+#define     R32G32_FLOAT                                     133
+#define     R32G32_SINT                                      134
+#define     R32G32_UINT                                      135
+#define     R32_FLOAT_X8X24_TYPELESS                         136
+#define     X32_TYPELESS_G8X24_UINT                          137
+#define     L32A32_FLOAT                                     138
+#define     R32G32_UNORM                                     139
+#define     R32G32_SNORM                                     140
+#define     R64_FLOAT                                        141
+#define     R16G16B16X16_UNORM                               142
+#define     R16G16B16X16_FLOAT                               143
+#define     A32X32_FLOAT                                     144
+#define     L32X32_FLOAT                                     145
+#define     I32X32_FLOAT                                     146
+#define     R16G16B16A16_SSCALED                             147
+#define     R16G16B16A16_USCALED                             148
+#define     R32G32_SSCALED                                   149
+#define     R32G32_USCALED                                   150
+#define     R32G32_SFIXED                                    160
+#define     R64_PASSTHRU                                     161
+#define     B8G8R8A8_UNORM                                   192
+#define     B8G8R8A8_UNORM_SRGB                              193
+#define     R10G10B10A2_UNORM                                194
+#define     R10G10B10A2_UNORM_SRGB                           195
+#define     R10G10B10A2_UINT                                 196
+#define     R10G10B10_SNORM_A2_UNORM                         197
+#define     R8G8B8A8_UNORM                                   199
+#define     R8G8B8A8_UNORM_SRGB                              200
+#define     R8G8B8A8_SNORM                                   201
+#define     R8G8B8A8_SINT                                    202
+#define     R8G8B8A8_UINT                                    203
+#define     R16G16_UNORM                                     204
+#define     R16G16_SNORM                                     205
+#define     R16G16_SINT                                      206
+#define     R16G16_UINT                                      207
+#define     R16G16_FLOAT                                     208
+#define     B10G10R10A2_UNORM                                209
+#define     B10G10R10A2_UNORM_SRGB                           210
+#define     R11G11B10_FLOAT                                  211
+#define     R32_SINT                                         214
+#define     R32_UINT                                         215
+#define     R32_FLOAT                                        216
+#define     R24_UNORM_X8_TYPELESS                            217
+#define     X24_TYPELESS_G8_UINT                             218
+#define     L32_UNORM                                        221
+#define     A32_UNORM                                        222
+#define     L16A16_UNORM                                     223
+#define     I24X8_UNORM                                      224
+#define     L24X8_UNORM                                      225
+#define     A24X8_UNORM                                      226
+#define     I32_FLOAT                                        227
+#define     L32_FLOAT                                        228
+#define     A32_FLOAT                                        229
+#define     X8B8_UNORM_G8R8_SNORM                            230
+#define     A8X8_UNORM_G8R8_SNORM                            231
+#define     B8X8_UNORM_G8R8_SNORM                            232
+#define     B8G8R8X8_UNORM                                   233
+#define     B8G8R8X8_UNORM_SRGB                              234
+#define     R8G8B8X8_UNORM                                   235
+#define     R8G8B8X8_UNORM_SRGB                              236
+#define     R9G9B9E5_SHAREDEXP                               237
+#define     B10G10R10X2_UNORM                                238
+#define     L16A16_FLOAT                                     240
+#define     R32_UNORM                                        241
+#define     R32_SNORM                                        242
+#define     R10G10B10X2_USCALED                              243
+#define     R8G8B8A8_SSCALED                                 244
+#define     R8G8B8A8_USCALED                                 245
+#define     R16G16_SSCALED                                   246
+#define     R16G16_USCALED                                   247
+#define     R32_SSCALED                                      248
+#define     R32_USCALED                                      249
+#define     B5G6R5_UNORM                                     256
+#define     B5G6R5_UNORM_SRGB                                257
+#define     B5G5R5A1_UNORM                                   258
+#define     B5G5R5A1_UNORM_SRGB                              259
+#define     B4G4R4A4_UNORM                                   260
+#define     B4G4R4A4_UNORM_SRGB                              261
+#define     R8G8_UNORM                                       262
+#define     R8G8_SNORM                                       263
+#define     R8G8_SINT                                        264
+#define     R8G8_UINT                                        265
+#define     R16_UNORM                                        266
+#define     R16_SNORM                                        267
+#define     R16_SINT                                         268
+#define     R16_UINT                                         269
+#define     R16_FLOAT                                        270
+#define     A8P8_UNORM_PALETTE0                              271
+#define     A8P8_UNORM_PALETTE1                              272
+#define     I16_UNORM                                        273
+#define     L16_UNORM                                        274
+#define     A16_UNORM                                        275
+#define     L8A8_UNORM                                       276
+#define     I16_FLOAT                                        277
+#define     L16_FLOAT                                        278
+#define     A16_FLOAT                                        279
+#define     L8A8_UNORM_SRGB                                  280
+#define     R5G5_SNORM_B6_UNORM                              281
+#define     B5G5R5X1_UNORM                                   282
+#define     B5G5R5X1_UNORM_SRGB                              283
+#define     R8G8_SSCALED                                     284
+#define     R8G8_USCALED                                     285
+#define     R16_SSCALED                                      286
+#define     R16_USCALED                                      287
+#define     P8A8_UNORM_PALETTE0                              290
+#define     P8A8_UNORM_PALETTE1                              291
+#define     A1B5G5R5_UNORM                                   292
+#define     A4B4G4R4_UNORM                                   293
+#define     L8A8_UINT                                        294
+#define     L8A8_SINT                                        295
+#define     R8_UNORM                                         320
+#define     R8_SNORM                                         321
+#define     R8_SINT                                          322
+#define     R8_UINT                                          323
+#define     A8_UNORM                                         324
+#define     I8_UNORM                                         325
+#define     L8_UNORM                                         326
+#define     P4A4_UNORM_PALETTE0                              327
+#define     A4P4_UNORM_PALETTE0                              328
+#define     R8_SSCALED                                       329
+#define     R8_USCALED                                       330
+#define     P8_UNORM_PALETTE0                                331
+#define     L8_UNORM_SRGB                                    332
+#define     P8_UNORM_PALETTE1                                333
+#define     P4A4_UNORM_PALETTE1                              334
+#define     A4P4_UNORM_PALETTE1                              335
+#define     Y8_UNORM                                         336
+#define     L8_UINT                                          338
+#define     L8_SINT                                          339
+#define     I8_UINT                                          340
+#define     I8_SINT                                          341
+#define     DXT1_RGB_SRGB                                    384
+#define     R1_UNORM                                         385
+#define     YCRCB_NORMAL                                     386
+#define     YCRCB_SWAPUVY                                    387
+#define     P2_UNORM_PALETTE0                                388
+#define     P2_UNORM_PALETTE1                                389
+#define     BC1_UNORM                                        390
+#define     BC2_UNORM                                        391
+#define     BC3_UNORM                                        392
+#define     BC4_UNORM                                        393
+#define     BC5_UNORM                                        394
+#define     BC1_UNORM_SRGB                                   395
+#define     BC2_UNORM_SRGB                                   396
+#define     BC3_UNORM_SRGB                                   397
+#define     MONO8                                            398
+#define     YCRCB_SWAPUV                                     399
+#define     YCRCB_SWAPY                                      400
+#define     DXT1_RGB                                         401
+#define     FXT1                                             402
+#define     R8G8B8_UNORM                                     403
+#define     R8G8B8_SNORM                                     404
+#define     R8G8B8_SSCALED                                   405
+#define     R8G8B8_USCALED                                   406
+#define     R64G64B64A64_FLOAT                               407
+#define     R64G64B64_FLOAT                                  408
+#define     BC4_SNORM                                        409
+#define     BC5_SNORM                                        410
+#define     R16G16B16_FLOAT                                  411
+#define     R16G16B16_UNORM                                  412
+#define     R16G16B16_SNORM                                  413
+#define     R16G16B16_SSCALED                                414
+#define     R16G16B16_USCALED                                415
+#define     BC6H_SF16                                        417
+#define     BC7_UNORM                                        418
+#define     BC7_UNORM_SRGB                                   419
+#define     BC6H_UF16                                        420
+#define     PLANAR_420_8                                     421
+#define     R8G8B8_UNORM_SRGB                                424
+#define     ETC1_RGB8                                        425
+#define     ETC2_RGB8                                        426
+#define     EAC_R11                                          427
+#define     EAC_RG11                                         428
+#define     EAC_SIGNED_R11                                   429
+#define     EAC_SIGNED_RG11                                  430
+#define     ETC2_SRGB8                                       431
+#define     R16G16B16_UINT                                   432
+#define     R16G16B16_SINT                                   433
+#define     R32_SFIXED                                       434
+#define     R10G10B10A2_SNORM                                435
+#define     R10G10B10A2_USCALED                              436
+#define     R10G10B10A2_SSCALED                              437
+#define     R10G10B10A2_SINT                                 438
+#define     B10G10R10A2_SNORM                                439
+#define     B10G10R10A2_USCALED                              440
+#define     B10G10R10A2_SSCALED                              441
+#define     B10G10R10A2_UINT                                 442
+#define     B10G10R10A2_SINT                                 443
+#define     R64G64B64A64_PASSTHRU                            444
+#define     R64G64B64_PASSTHRU                               445
+#define     ETC2_RGB8_PTA                                    448
+#define     ETC2_SRGB8_PTA                                   449
+#define     ETC2_EAC_RGBA8                                   450
+#define     ETC2_EAC_SRGB8_A8                                451
+#define     R8G8B8_UINT                                      456
+#define     R8G8B8_SINT                                      457
+#define     RAW                                              511
+
+/* Enum Shader Channel Select (codes 2 and 3 are not defined here; RED..ALPHA map to 4..7) */
+#define     SCS_ZERO                                           0
+#define     SCS_ONE                                            1
+#define     SCS_RED                                            4
+#define     SCS_GREEN                                          5
+#define     SCS_BLUE                                           6
+#define     SCS_ALPHA                                          7
+
+/* Enum Texture Coordinate Mode */
+#define     TCM_WRAP                                           0
+#define     TCM_MIRROR                                         1
+#define     TCM_CLAMP                                          2
+#define     TCM_CUBE                                           3
+#define     TCM_CLAMP_BORDER                                   4
+#define     TCM_MIRROR_ONCE                                    5
+#define     TCM_HALF_BORDER                                    6
+
diff --git a/src/vulkan/genX_cmd_buffer.c b/src/vulkan/genX_cmd_buffer.c
new file mode 100644 (file)
index 0000000..4f7054f
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+
+#if (ANV_GEN == 9)
+#  include "gen9_pack.h"
+#elif (ANV_GEN == 8)
+#  include "gen8_pack.h"
+#elif (ANV_IS_HASWELL)
+#  include "gen75_pack.h"
+#elif (ANV_GEN == 7)
+#  include "gen7_pack.h"
+#endif
+
+void
+genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_device *device = cmd_buffer->device;
+   struct anv_bo *scratch_bo = NULL;
+
+   cmd_buffer->state.scratch_size =
+      anv_block_pool_size(&device->scratch_block_pool);
+   if (cmd_buffer->state.scratch_size > 0)
+      scratch_bo = &device->scratch_block_pool.bo;
+
+/* XXX: Do we need this on more than just BDW? */
+#if (ANV_GEN == 8)
+   /* Emit a render target cache flush.
+    *
+    * This isn't documented anywhere in the PRM.  However, it seems to be
+    * necessary prior to changing the surface state base address.  Without
+    * this, we get GPU hangs when using multi-level command buffers which
+    * clear depth, reset state base address, and then go render stuff.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
+                  .RenderTargetCacheFlushEnable = true);
+#endif
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS),
+      .GeneralStateBaseAddress = { scratch_bo, 0 },
+      .GeneralStateMemoryObjectControlState = GENX(MOCS),
+      .GeneralStateBaseAddressModifyEnable = true,
+
+      .SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer),
+      .SurfaceStateMemoryObjectControlState = GENX(MOCS),
+      .SurfaceStateBaseAddressModifyEnable = true,
+
+      .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
+      .DynamicStateMemoryObjectControlState = GENX(MOCS),
+      .DynamicStateBaseAddressModifyEnable = true,
+
+      .IndirectObjectBaseAddress = { NULL, 0 },
+      .IndirectObjectMemoryObjectControlState = GENX(MOCS),
+      .IndirectObjectBaseAddressModifyEnable = true,
+
+      .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
+      .InstructionMemoryObjectControlState = GENX(MOCS),
+      .InstructionBaseAddressModifyEnable = true,
+
+#  if (ANV_GEN >= 8)
+      /* Broadwell requires that we specify a buffer size for a bunch of
+       * these fields.  However, since we will be growing the BO's live, we
+       * just set them all to the maximum.
+       */
+      .GeneralStateBufferSize = 0xfffff,
+      .GeneralStateBufferSizeModifyEnable = true,
+      .DynamicStateBufferSize = 0xfffff,
+      .DynamicStateBufferSizeModifyEnable = true,
+      .IndirectObjectBufferSize = 0xfffff,
+      .IndirectObjectBufferSizeModifyEnable = true,
+      .InstructionBufferSize = 0xfffff,
+      .InstructionBuffersizeModifyEnable = true,
+#  endif
+   );
+
+   /* After re-setting the surface state base address, we have to do some
+    * cache flushing so that the sampler engine will pick up the new
+    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
+    * Shared Function > 3D Sampler > State > State Caching (page 96):
+    *
+    *    Coherency with system memory in the state cache, like the texture
+    *    cache is handled partially by software. It is expected that the
+    *    command stream or shader will issue Cache Flush operation or
+    *    Cache_Flush sampler message to ensure that the L1 cache remains
+    *    coherent with system memory.
+    *
+    *    [...]
+    *
+    *    Whenever the value of the Dynamic_State_Base_Addr,
+    *    Surface_State_Base_Addr are altered, the L1 state cache must be
+    *    invalidated to ensure the new surface or sampler state is fetched
+    *    from system memory.
+    *
+    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
+    * which, according to the PIPE_CONTROL instruction documentation in the
+    * Broadwell PRM:
+    *
+    *    Setting this bit is independent of any other bit in this packet.
+    *    This bit controls the invalidation of the L1 and L2 state caches
+    *    at the top of the pipe i.e. at the parsing time.
+    *
+    * Unfortunately, experimentation seems to indicate that state cache
+    * invalidation through a PIPE_CONTROL does nothing whatsoever in
+    * regards to surface state and binding tables.  Instead, it seems that
+    * invalidating the texture cache is what is actually needed.
+    *
+    * XXX: As far as we have been able to determine through
+    * experimentation, flushing the texture cache appears to be
+    * sufficient.  The theory here is that all of the sampling/rendering
+    * units cache the binding table in the texture cache.  However, we have
+    * yet to be able to actually confirm this.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                  .TextureCacheInvalidationEnable = true);
+}
+
+void genX(CmdPipelineBarrier)(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineStageFlags                        srcStageMask,
+    VkPipelineStageFlags                        destStageMask,
+    VkBool32                                    byRegion,
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   uint32_t b, *dw;
+
+   struct GENX(PIPE_CONTROL) cmd = {
+      GENX(PIPE_CONTROL_header),
+      .PostSyncOperation = NoWrite,
+   };
+
+   /* XXX: I think waitEvent is a no-op on our HW.  We should verify that. */
+
+   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
+      /* This is just what PIPE_CONTROL does */
+   }
+
+   if (anv_clear_mask(&srcStageMask,
+                      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
+                      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
+                      VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+                      VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
+                      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
+                      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
+                      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
+                      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
+                      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
+                      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)) {
+      cmd.StallAtPixelScoreboard = true;
+   }
+
+   if (anv_clear_mask(&srcStageMask,
+                      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
+                      VK_PIPELINE_STAGE_TRANSFER_BIT)) {
+      cmd.CommandStreamerStallEnable = true;
+   }
+
+   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_HOST_BIT)) {
+      anv_finishme("VK_PIPE_EVENT_CPU_SIGNAL_BIT");
+   }
+
+   /* On our hardware, all stages will wait for execution as needed. */
+   (void)destStageMask;
+
+   /* We checked all known VkPipeEventFlags. */
+   anv_assert(srcStageMask == 0);
+
+   /* XXX: Right now, we're really dumb and just flush whatever categories
+    * the app asks for.  One of these days we may make this a bit better
+    * but right now that's all the hardware allows for in most areas.
+    */
+   VkAccessFlags src_flags = 0;
+   VkAccessFlags dst_flags = 0;
+
+   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
+      src_flags |= pMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pMemoryBarriers[i].dstAccessMask;
+   }
+
+   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
+      src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
+   }
+
+   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
+      src_flags |= pImageMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
+   }
+
+   /* The src flags represent how things were used previously.  This is
+    * what we use for doing flushes.
+    */
+   for_each_bit(b, src_flags) {
+      switch ((VkAccessFlagBits)(1 << b)) {
+      case VK_ACCESS_SHADER_WRITE_BIT:
+         cmd.DCFlushEnable = true;
+         break;
+      case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
+         cmd.RenderTargetCacheFlushEnable = true;
+         break;
+      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
+         cmd.DepthCacheFlushEnable = true;
+         break;
+      case VK_ACCESS_TRANSFER_WRITE_BIT:
+         cmd.RenderTargetCacheFlushEnable = true;
+         cmd.DepthCacheFlushEnable = true;
+         break;
+      default:
+         /* Doesn't require a flush */
+         break;
+      }
+   }
+
+   /* The dst flags represent how things will be used in the future.  This
+    * is what we use for doing cache invalidations.
+    */
+   for_each_bit(b, dst_flags) {
+      switch ((VkAccessFlagBits)(1 << b)) {
+      case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
+      case VK_ACCESS_INDEX_READ_BIT:
+      case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
+         cmd.VFCacheInvalidationEnable = true;
+         break;
+      case VK_ACCESS_UNIFORM_READ_BIT:
+         cmd.ConstantCacheInvalidationEnable = true;
+         /* fallthrough */
+      case VK_ACCESS_SHADER_READ_BIT:
+         cmd.TextureCacheInvalidationEnable = true;
+         break;
+      case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
+         cmd.TextureCacheInvalidationEnable = true;
+         break;
+      case VK_ACCESS_TRANSFER_READ_BIT:
+         cmd.TextureCacheInvalidationEnable = true;
+         break;
+      case VK_ACCESS_MEMORY_READ_BIT:
+         break; /* XXX: What is this? */
+      default:
+         /* Doesn't require a flush */
+         break;
+      }
+   }
+
+   dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
+   GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &cmd);
+}
diff --git a/src/vulkan/genX_pipeline_util.h b/src/vulkan/genX_pipeline_util.h
new file mode 100644 (file)
index 0000000..08fe6aa
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+static uint32_t
+vertex_element_comp_control(enum isl_format format, unsigned comp)
+{
+   uint8_t bits;
+   switch (comp) {
+   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
+   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
+   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
+   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
+   default: unreachable("Invalid component");
+   }
+
+   if (bits) {
+      return VFCOMP_STORE_SRC;
+   } else if (comp < 3) {
+      return VFCOMP_STORE_0;
+   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
+            isl_format_layouts[format].channels.r.type == ISL_SINT) {
+      assert(comp == 3);
+      return VFCOMP_STORE_1_INT;
+   } else {
+      assert(comp == 3);
+      return VFCOMP_STORE_1_FP;
+   }
+}
+
+static const uint32_t vk_to_gen_cullmode[] = {
+   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
+   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
+   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
+   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
+};
+
+static const uint32_t vk_to_gen_fillmode[] = {
+   [VK_POLYGON_MODE_FILL]                    = RASTER_SOLID,
+   [VK_POLYGON_MODE_LINE]                    = RASTER_WIREFRAME,
+   [VK_POLYGON_MODE_POINT]                   = RASTER_POINT,
+};
+
+static const uint32_t vk_to_gen_front_face[] = {
+   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
+   [VK_FRONT_FACE_CLOCKWISE]                 = 0
+};
+
+static const uint32_t vk_to_gen_logic_op[] = {
+   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
+   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
+   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
+   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
+   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
+   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
+   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
+   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
+   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
+   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
+   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
+   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
+   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
+   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
+   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
+   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
+};
+
+static const uint32_t vk_to_gen_blend[] = {
+   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
+   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
+   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
+   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
+   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
+   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
+   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
+   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
+   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
+   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
+   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
+   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
+   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
+   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
+   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
+   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
+   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
+   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
+   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
+};
+
+static const uint32_t vk_to_gen_blend_op[] = {
+   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
+   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
+   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
+   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
+   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
+};
+
+static const uint32_t vk_to_gen_compare_op[] = {
+   [VK_COMPARE_OP_NEVER]                        = PREFILTEROPNEVER,
+   [VK_COMPARE_OP_LESS]                         = PREFILTEROPLESS,
+   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROPEQUAL,
+   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROPLEQUAL,
+   [VK_COMPARE_OP_GREATER]                      = PREFILTEROPGREATER,
+   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROPNOTEQUAL,
+   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROPGEQUAL,
+   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROPALWAYS,
+};
+
+static const uint32_t vk_to_gen_stencil_op[] = {
+   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
+   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
+   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
+   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
+   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
+   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
+   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
+   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
+};
diff --git a/src/vulkan/genX_state_util.h b/src/vulkan/genX_state_util.h
new file mode 100644 (file)
index 0000000..215e9ba
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+static const uint8_t
+anv_surftype(const struct anv_image *image, VkImageViewType view_type,
+             bool storage)
+{
+   switch (view_type) {
+   default:
+      unreachable("bad VkImageViewType");
+   case VK_IMAGE_VIEW_TYPE_1D:
+   case VK_IMAGE_VIEW_TYPE_1D_ARRAY:
+      assert(image->type == VK_IMAGE_TYPE_1D);
+      return SURFTYPE_1D;
+   case VK_IMAGE_VIEW_TYPE_CUBE:
+   case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY:
+      assert(image->type == VK_IMAGE_TYPE_2D);
+      return storage ? SURFTYPE_2D : SURFTYPE_CUBE;
+   case VK_IMAGE_VIEW_TYPE_2D:
+   case VK_IMAGE_VIEW_TYPE_2D_ARRAY:
+      assert(image->type == VK_IMAGE_TYPE_2D);
+      return SURFTYPE_2D;
+   case VK_IMAGE_VIEW_TYPE_3D:
+      assert(image->type == VK_IMAGE_TYPE_3D);
+      return SURFTYPE_3D;
+   }
+}
+
+#if ANV_GEN > 7 || ANV_IS_HASWELL
+static const uint32_t vk_to_gen_swizzle_map[] = {
+   [VK_COMPONENT_SWIZZLE_ZERO]                 = SCS_ZERO,
+   [VK_COMPONENT_SWIZZLE_ONE]                  = SCS_ONE,
+   [VK_COMPONENT_SWIZZLE_R]                    = SCS_RED,
+   [VK_COMPONENT_SWIZZLE_G]                    = SCS_GREEN,
+   [VK_COMPONENT_SWIZZLE_B]                    = SCS_BLUE,
+   [VK_COMPONENT_SWIZZLE_A]                    = SCS_ALPHA
+};
+
+static inline uint32_t
+vk_to_gen_swizzle(VkComponentSwizzle swizzle, VkComponentSwizzle component)
+{
+   if (swizzle == VK_COMPONENT_SWIZZLE_IDENTITY)
+      return vk_to_gen_swizzle_map[component];
+   else
+      return vk_to_gen_swizzle_map[swizzle];
+}
+#endif
+
+static inline uint32_t
+vk_to_gen_tex_filter(VkFilter filter, bool anisotropyEnable)
+{
+   switch (filter) {
+   default:
+      assert(!"Invalid filter");
+   case VK_FILTER_NEAREST:
+      return MAPFILTER_NEAREST;
+   case VK_FILTER_LINEAR:
+      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
+   }
+}
+
+static inline uint32_t
+vk_to_gen_max_anisotropy(float ratio)
+{
+   return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
+}
+
+static const uint32_t vk_to_gen_mipmap_mode[] = {
+   [VK_SAMPLER_MIPMAP_MODE_NEAREST]          = MIPFILTER_NEAREST,
+   [VK_SAMPLER_MIPMAP_MODE_LINEAR]           = MIPFILTER_LINEAR
+};
+
+static const uint32_t vk_to_gen_tex_address[] = {
+   [VK_SAMPLER_ADDRESS_MODE_REPEAT]          = TCM_WRAP,
+   [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
+   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]   = TCM_CLAMP,
+   [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+};
+
+static const uint32_t vk_to_gen_compare_op[] = {
+   [VK_COMPARE_OP_NEVER]                     = PREFILTEROPNEVER,
+   [VK_COMPARE_OP_LESS]                      = PREFILTEROPLESS,
+   [VK_COMPARE_OP_EQUAL]                     = PREFILTEROPEQUAL,
+   [VK_COMPARE_OP_LESS_OR_EQUAL]             = PREFILTEROPLEQUAL,
+   [VK_COMPARE_OP_GREATER]                   = PREFILTEROPGREATER,
+   [VK_COMPARE_OP_NOT_EQUAL]                 = PREFILTEROPNOTEQUAL,
+   [VK_COMPARE_OP_GREATER_OR_EQUAL]          = PREFILTEROPGEQUAL,
+   [VK_COMPARE_OP_ALWAYS]                    = PREFILTEROPALWAYS,
+};
diff --git a/src/vulkan/tests/.gitignore b/src/vulkan/tests/.gitignore
new file mode 100644 (file)
index 0000000..5d05405
--- /dev/null
@@ -0,0 +1,5 @@
+block_pool
+block_pool_no_free
+state_pool
+state_pool_free_list_only
+state_pool_no_free
diff --git a/src/vulkan/tests/Makefile.am b/src/vulkan/tests/Makefile.am
new file mode 100644 (file)
index 0000000..883013d
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright © 2009 Intel Corporation
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  on the rights to use, copy, modify, merge, publish, distribute, sub
+#  license, and/or sell copies of the Software, and to permit persons to whom
+#  the Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice (including the next
+#  paragraph) shall be included in all copies or substantial portions of the
+#  Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+#  ADAM JACKSON BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+#  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+#  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+AM_CPPFLAGS = \
+       $(INTEL_CFLAGS) \
+       $(VALGRIND_CFLAGS) \
+       $(DEFINES) \
+       -I$(top_srcdir)/include \
+       -I$(top_srcdir)/src \
+       -I$(top_srcdir)/src/mapi \
+       -I$(top_srcdir)/src/mesa \
+       -I$(top_srcdir)/src/mesa/drivers/dri/common \
+       -I$(top_srcdir)/src/mesa/drivers/dri/i965 \
+       -I$(top_srcdir)/src/gallium/auxiliary \
+       -I$(top_srcdir)/src/gallium/include \
+       -I$(top_srcdir)/src/isl/ \
+       -I$(top_srcdir)/src/vulkan
+
+LDADD = \
+       $(top_builddir)/src/vulkan/libvulkan-test.la \
+       $(PTHREAD_LIBS) -lm -lstdc++
+
+check_PROGRAMS = \
+       block_pool_no_free \
+       state_pool_no_free \
+       state_pool_free_list_only \
+       state_pool
+
+TESTS = $(check_PROGRAMS)
diff --git a/src/vulkan/tests/block_pool_no_free.c b/src/vulkan/tests/block_pool_no_free.c
new file mode 100644 (file)
index 0000000..86d1a76
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "anv_private.h"
+
+#define NUM_THREADS 16
+#define BLOCKS_PER_THREAD 1024
+#define NUM_RUNS 64
+
+struct job {
+   pthread_t thread;
+   unsigned id;
+   struct anv_block_pool *pool;
+   uint32_t blocks[BLOCKS_PER_THREAD];
+   uint32_t back_blocks[BLOCKS_PER_THREAD];
+} jobs[NUM_THREADS];
+
+
+static void *alloc_blocks(void *_job)
+{
+   struct job *job = _job;
+   int32_t block, *data;
+
+   for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
+      block = anv_block_pool_alloc(job->pool);
+      data = job->pool->map + block;
+      *data = block;
+      assert(block >= 0);
+      job->blocks[i] = block;
+
+      block = anv_block_pool_alloc_back(job->pool);
+      data = job->pool->map + block;
+      *data = block;
+      assert(block < 0);
+      job->back_blocks[i] = -block;
+   }
+
+   for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
+      block = job->blocks[i];
+      data = job->pool->map + block;
+      assert(*data == block);
+
+      block = -job->back_blocks[i];
+      data = job->pool->map + block;
+      assert(*data == block);
+   }
+
+   return NULL;
+}
+
+static void validate_monotonic(uint32_t **blocks)
+{
+   /* A list of indices, one per thread */
+   unsigned next[NUM_THREADS];
+   memset(next, 0, sizeof(next));
+
+   int highest = -1;
+   while (true) {
+      /* First, we find which thread has the highest next element */
+      int thread_max = -1;
+      int max_thread_idx = -1;
+      for (unsigned i = 0; i < NUM_THREADS; i++) {
+         if (next[i] >= BLOCKS_PER_THREAD)
+            continue;
+
+         if (thread_max < blocks[i][next[i]]) {
+            thread_max = blocks[i][next[i]];
+            max_thread_idx = i;
+         }
+      }
+
+      /* The only way this can happen is if all of the next[] values are at
+       * BLOCKS_PER_THREAD, in which case, we're done.
+       */
+      if (thread_max == -1)
+         break;
+
+      /* That next element had better be higher than the previous highest */
+      assert(blocks[max_thread_idx][next[max_thread_idx]] > highest);
+
+      highest = blocks[max_thread_idx][next[max_thread_idx]];
+      next[max_thread_idx]++;
+   }
+}
+
+static void run_test()
+{
+   struct anv_device device;
+   struct anv_block_pool pool;
+
+   pthread_mutex_init(&device.mutex, NULL);
+   anv_block_pool_init(&pool, &device, 16);
+
+   for (unsigned i = 0; i < NUM_THREADS; i++) {
+      jobs[i].pool = &pool;
+      jobs[i].id = i;
+      pthread_create(&jobs[i].thread, NULL, alloc_blocks, &jobs[i]);
+   }
+
+   for (unsigned i = 0; i < NUM_THREADS; i++)
+      pthread_join(jobs[i].thread, NULL);
+
+   /* Validate that the block allocations were monotonic */
+   uint32_t *block_ptrs[NUM_THREADS];
+   for (unsigned i = 0; i < NUM_THREADS; i++)
+      block_ptrs[i] = jobs[i].blocks;
+   validate_monotonic(block_ptrs);
+
+   /* Validate that the back block allocations were monotonic */
+   for (unsigned i = 0; i < NUM_THREADS; i++)
+      block_ptrs[i] = jobs[i].back_blocks;
+   validate_monotonic(block_ptrs);
+
+   anv_block_pool_finish(&pool);
+   pthread_mutex_destroy(&device.mutex);
+}
+
+int main(int argc, char **argv)
+{
+   for (unsigned i = 0; i < NUM_RUNS; i++)
+      run_test();
+}
diff --git a/src/vulkan/tests/state_pool.c b/src/vulkan/tests/state_pool.c
new file mode 100644 (file)
index 0000000..878ec19
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "anv_private.h"
+
+#define NUM_THREADS 8
+#define STATES_PER_THREAD_LOG2 10
+#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
+#define NUM_RUNS 64
+
+#include "state_pool_test_helper.h"
+
+int main(int argc, char **argv)
+{
+   struct anv_device device;
+   struct anv_block_pool block_pool;
+   struct anv_state_pool state_pool;
+
+   pthread_mutex_init(&device.mutex, NULL);
+
+   for (unsigned i = 0; i < NUM_RUNS; i++) {
+      anv_block_pool_init(&block_pool, &device, 256);
+      anv_state_pool_init(&state_pool, &block_pool);
+
+      /* Grab one so a zero offset is impossible */
+      anv_state_pool_alloc(&state_pool, 16, 16);
+
+      run_state_pool_test(&state_pool);
+
+      anv_state_pool_finish(&state_pool);
+      anv_block_pool_finish(&block_pool);
+   }
+
+   pthread_mutex_destroy(&device.mutex);
+}
diff --git a/src/vulkan/tests/state_pool_free_list_only.c b/src/vulkan/tests/state_pool_free_list_only.c
new file mode 100644 (file)
index 0000000..2f4eb47
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "anv_private.h"
+
+#define NUM_THREADS 8
+#define STATES_PER_THREAD_LOG2 12
+#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
+
+#include "state_pool_test_helper.h"
+
+/* Variant of the state-pool stress test that pre-warms the pool so
+ * run_state_pool_test() only ever exercises the free lists, never the
+ * growth path: every state the test can ask for has already been
+ * allocated and returned once before the threads start. */
+int main(int argc, char **argv)
+{
+   struct anv_device device;
+   struct anv_block_pool block_pool;
+   struct anv_state_pool state_pool;
+
+   pthread_mutex_init(&device.mutex, NULL);
+   anv_block_pool_init(&block_pool, &device, 4096);
+   anv_state_pool_init(&state_pool, &block_pool);
+
+   /* Allocate one state up front so offset 0 can never be handed out. */
+   anv_state_pool_alloc(&state_pool, 16, 16);
+
+   /* Grab and return enough states that the test below never has to
+    * resize anything: afterwards the free lists hold all it needs. */
+   {
+      const unsigned num_states = NUM_THREADS * STATES_PER_THREAD;
+      struct anv_state states[num_states];
+
+      for (unsigned i = 0; i < num_states; i++) {
+         states[i] = anv_state_pool_alloc(&state_pool, 16, 16);
+         assert(states[i].offset != 0);
+      }
+
+      for (unsigned i = 0; i < num_states; i++)
+         anv_state_pool_free(&state_pool, states[i]);
+   }
+
+   run_state_pool_test(&state_pool);
+
+   anv_state_pool_finish(&state_pool);
+   anv_block_pool_finish(&block_pool);
+   pthread_mutex_destroy(&device.mutex);
+}
diff --git a/src/vulkan/tests/state_pool_no_free.c b/src/vulkan/tests/state_pool_no_free.c
new file mode 100644 (file)
index 0000000..4b248c2
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "anv_private.h"
+
+#define NUM_THREADS 16
+#define STATES_PER_THREAD 1024
+#define NUM_RUNS 64
+
+/* Per-thread bookkeeping for the concurrent allocation race below. */
+struct job {
+   pthread_t thread;                    /* worker thread handle */
+   unsigned id;                         /* this job's index into jobs[] */
+   struct anv_state_pool *pool;         /* pool shared by all threads */
+   uint32_t offsets[STATES_PER_THREAD]; /* offsets in allocation order */
+} jobs[NUM_THREADS];
+
+/* Releases all workers at once so their allocations really race. */
+pthread_barrier_t barrier;
+
+/* Worker body: wait until every thread is ready, then allocate
+ * STATES_PER_THREAD states back-to-back, recording each returned offset
+ * in issue order for the uniqueness check in run_test(). */
+static void *alloc_states(void *_job)
+{
+   struct job *job = _job;
+
+   /* Start all threads at the same moment to maximize contention. */
+   pthread_barrier_wait(&barrier);
+
+   for (unsigned i = 0; i < STATES_PER_THREAD; i++)
+      job->offsets[i] = anv_state_pool_alloc(job->pool, 16, 16).offset;
+
+   return NULL;
+}
+
+/* Run one round of the no-free allocation test: NUM_THREADS workers each
+ * allocate STATES_PER_THREAD states from a shared pool (nothing is ever
+ * freed), then the recorded offsets are checked for global uniqueness.
+ *
+ * Verification: offsets are recorded per thread in allocation order, so
+ * with no frees each per-thread list should be increasing.  Merging the
+ * lists by repeatedly consuming the smallest remaining head then visits
+ * all offsets in ascending order; requiring each consumed value to be
+ * strictly greater than the previous one proves no offset was handed out
+ * twice.
+ *
+ * NOTE(review): the previous version selected the *largest* head, and its
+ * `int thread_max = -1` compared against uint32_t offsets was promoted to
+ * unsigned (-1 -> UINT_MAX), so the scan terminated immediately and
+ * validated nothing.  The min-head merge with 64-bit signed locals fixes
+ * both problems.
+ */
+static void run_test(void)
+{
+   struct anv_device device;
+   struct anv_block_pool block_pool;
+   struct anv_state_pool state_pool;
+
+   pthread_mutex_init(&device.mutex, NULL);
+   anv_block_pool_init(&block_pool, &device, 64);
+   anv_state_pool_init(&state_pool, &block_pool);
+
+   pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+
+   for (unsigned i = 0; i < NUM_THREADS; i++) {
+      jobs[i].pool = &state_pool;
+      jobs[i].id = i;
+      pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+   }
+
+   for (unsigned i = 0; i < NUM_THREADS; i++)
+      pthread_join(jobs[i].thread, NULL);
+
+   /* This function runs NUM_RUNS times; POSIX leaves re-initializing an
+    * undestroyed barrier undefined, so destroy it before returning. */
+   pthread_barrier_destroy(&barrier);
+
+   /* A cursor per thread into its offsets[] list */
+   unsigned next[NUM_THREADS];
+   memset(next, 0, sizeof(next));
+
+   int64_t highest = -1;
+   while (true) {
+      /* Find the thread whose next unconsumed offset is the smallest.
+       * Since each per-thread list is in allocation order, that head is
+       * the smallest offset not yet consumed overall. */
+      int64_t thread_min = INT64_MAX;
+      int min_thread_idx = -1;
+      for (unsigned i = 0; i < NUM_THREADS; i++) {
+         if (next[i] >= STATES_PER_THREAD)
+            continue;
+
+         if (thread_min > jobs[i].offsets[next[i]]) {
+            thread_min = jobs[i].offsets[next[i]];
+            min_thread_idx = i;
+         }
+      }
+
+      /* The only way this can happen is if all of the next[] values are
+       * at STATES_PER_THREAD, in which case, we're done. */
+      if (min_thread_idx < 0)
+         break;
+
+      /* Strictly greater than everything consumed so far => every offset
+       * the pool handed out is distinct. */
+      assert(jobs[min_thread_idx].offsets[next[min_thread_idx]] > highest);
+
+      highest = jobs[min_thread_idx].offsets[next[min_thread_idx]];
+      next[min_thread_idx]++;
+   }
+
+   anv_state_pool_finish(&state_pool);
+   anv_block_pool_finish(&block_pool);
+   pthread_mutex_destroy(&device.mutex);
+}
+
+/* Repeat the race NUM_RUNS times so unlucky thread interleavings get
+ * plenty of chances to manifest. */
+int main(int argc, char **argv)
+{
+   for (unsigned run = 0; run < NUM_RUNS; run++)
+      run_test();
+}
diff --git a/src/vulkan/tests/state_pool_test_helper.h b/src/vulkan/tests/state_pool_test_helper.h
new file mode 100644 (file)
index 0000000..0e56431
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+/* Per-thread bookkeeping for the shared state-pool stress test.  This
+ * header expects NUM_THREADS, STATES_PER_THREAD and
+ * STATES_PER_THREAD_LOG2 to be #defined by the including .c file before
+ * inclusion. */
+struct job {
+   struct anv_state_pool *pool; /* pool shared by all worker threads */
+   unsigned id;                 /* this job's index into jobs[] */
+   pthread_t thread;            /* worker thread handle */
+} jobs[NUM_THREADS];
+
+/* Releases all workers at once so their allocations really race. */
+pthread_barrier_t barrier;
+
+/* Worker body: repeatedly allocate a chunk of states, write to each
+ * state's map, then free the whole chunk back to the pool.  The chunk
+ * size varies per thread (1 << (id % STATES_PER_THREAD_LOG2)) so that
+ * different threads stress the free lists with different patterns. */
+static void *alloc_states(void *void_job)
+{
+   struct job *job = void_job;
+
+   const unsigned chunk_size = 1 << (job->id % STATES_PER_THREAD_LOG2);
+   const unsigned num_chunks = STATES_PER_THREAD / chunk_size;
+
+   struct anv_state states[chunk_size];
+
+   /* Start all threads at the same moment to maximize contention. */
+   pthread_barrier_wait(&barrier);
+
+   for (unsigned chunk = 0; chunk < num_chunks; chunk++) {
+      /* Fill one chunk, scribbling on each map and making sure the pool
+       * never hands out offset 0. */
+      for (unsigned i = 0; i < chunk_size; i++) {
+         states[i] = anv_state_pool_alloc(job->pool, 16, 16);
+         memset(states[i].map, 139, 16);
+         assert(states[i].offset != 0);
+      }
+
+      /* Return the whole chunk to the pool. */
+      for (unsigned i = 0; i < chunk_size; i++)
+         anv_state_pool_free(job->pool, states[i]);
+   }
+
+   return NULL;
+}
+
+/* Kick off NUM_THREADS alloc_states() workers against the given pool and
+ * wait for all of them to finish.  The barrier makes the workers begin
+ * allocating at (roughly) the same instant so they actually contend.
+ *
+ * The barrier must be destroyed before returning: some callers invoke
+ * this function many times, and POSIX leaves re-initializing a barrier
+ * that was never destroyed undefined.
+ */
+static void run_state_pool_test(struct anv_state_pool *state_pool)
+{
+   pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+
+   for (unsigned i = 0; i < NUM_THREADS; i++) {
+      jobs[i].pool = state_pool;
+      jobs[i].id = i;
+      pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+   }
+
+   for (unsigned i = 0; i < NUM_THREADS; i++)
+      pthread_join(jobs[i].thread, NULL);
+
+   pthread_barrier_destroy(&barrier);
+}