From b642730be93149baa7556e5791393168ab396175 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 17:35:24 +0900
Subject: Code reorganization: move files into their places.

This is in a separate commit to ensure renames are properly preserved.
---
 src/gallium/drivers/cell/spu/spu_main.h | 177 ++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/spu_main.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
new file mode 100644
index 0000000000..1710a17512
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -0,0 +1,177 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_MAIN_H
+#define SPU_MAIN_H
+
+
+#include <spu_mfcio.h>
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_vertex.h"
+#include "pipe/p_state.h"
+
+
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
+struct spu_framebuffer {
+   void *color_start;              /**< addr of color surface in main memory */
+   void *depth_start;              /**< addr of depth surface in main memory */
+   enum pipe_format color_format;
+   enum pipe_format depth_format;
+   uint width, height;             /**< size in pixels */
+   uint width_tiles, height_tiles; /**< width and height in tiles */
+
+   uint color_clear_value;
+   uint depth_clear_value;
+
+   uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+} ALIGN16_ATTRIB;
+
+
+/**
+ * All SPU global/context state will be in singleton object of this type:
+ */
+struct spu_global
+{
+   struct cell_init_info init;
+
+   struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
+   struct pipe_depth_stencil_alpha_state depth_stencil;
+   struct pipe_blend_state blend;
+   struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
+
+   struct vertex_info vertex_info;
+
+   /* XXX more state to come */
+
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
+   vector float (*sample_texture)(vector float texcoord);
+
+} ALIGN16_ATTRIB;
+
+
+extern struct spu_global spu;
+extern boolean Debug;
+
+
+
+
+/* DMA TAGS */
+
+#define TAG_SURFACE_CLEAR     10
+#define TAG_VERTEX_BUFFER     11
+#define TAG_READ_TILE_COLOR   12
+#define TAG_READ_TILE_Z       13
+#define TAG_WRITE_TILE_COLOR  14
+#define TAG_WRITE_TILE_Z      15
+#define TAG_INDEX_BUFFER      16
+#define TAG_BATCH_BUFFER      17
+#define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
+
+
+
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
+
+
+static INLINE void
+memset16(ushort *d, ushort value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+static INLINE void
+memset32(uint *d, uint value, uint count)
+{
+   uint i;
+   for (i = 0; i < count; i++)
+      d[i] = value;
+}
+
+
+#endif /* SPU_MAIN_H */
-- 
cgit v1.2.3


From 6acd63a4980951727939c0dd545a0324965b3834 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 15 Feb 2008 17:50:12 +0900
Subject: Code reorganization: update build.

Update the Makefiles and includes for the new paths.

Note that there hasn't been no separation of the Makefiles yet, and make is
jumping all over the place. That will be taken care shortly. But for now, make
should work. It was tested with linux and linux-dri. Linux-cell and linux-llvm
might require some minor tweaks.
---
 configs/beos                                       |   2 +-
 configs/darwin                                     |   2 +-
 configs/darwin-x86ppc                              |   2 +-
 configs/default                                    |   2 +-
 configs/freebsd-dri                                |   2 +-
 configs/linux-cell                                 |   2 +-
 configs/linux-directfb                             |   2 +-
 configs/linux-dri                                  |   6 +-
 configs/linux-dri-xcb                              |   4 +-
 configs/linux-fbdev                                |   2 +-
 configs/linux-osmesa                               |   2 +-
 configs/linux-osmesa16                             |   2 +-
 configs/linux-osmesa16-static                      |   2 +-
 configs/linux-osmesa32                             |   2 +-
 configs/linux-solo                                 |   2 +-
 src/gallium/Makefile                               |  12 +--
 src/gallium/Makefile.template                      |   7 +-
 src/gallium/aux/Makefile                           |  24 +++++
 src/gallium/aux/draw/draw_private.h                |   2 +-
 src/gallium/aux/draw/draw_vertex.c                 |   4 +-
 src/gallium/aux/draw/draw_vertex_shader.c          |   4 +-
 src/gallium/aux/llvm/Makefile                      |   4 +-
 src/gallium/aux/llvm/gallivm.cpp                   |   4 +-
 src/gallium/aux/llvm/gallivm_cpu.cpp               |   4 +-
 src/gallium/aux/llvm/tgsitollvm.cpp                |  10 +-
 src/gallium/aux/pipebuffer/Makefile                |   2 +-
 src/gallium/aux/tgsi/exec/tgsi_exec.c              |   4 +-
 src/gallium/aux/tgsi/exec/tgsi_sse2.c              |   4 +-
 src/gallium/aux/tgsi/util/tgsi_transform.h         |   4 +-
 src/gallium/drivers/Makefile                       |  24 +++++
 src/gallium/drivers/cell/ppu/Makefile              |   7 +-
 src/gallium/drivers/cell/ppu/cell_clear.c          |   2 +-
 src/gallium/drivers/cell/ppu/cell_context.c        |   6 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |   6 +-
 src/gallium/drivers/cell/ppu/cell_draw_arrays.c    |   2 +-
 src/gallium/drivers/cell/ppu/cell_flush.c          |   2 +-
 src/gallium/drivers/cell/ppu/cell_render.c         |   2 +-
 src/gallium/drivers/cell/ppu/cell_spu.c            |   2 +-
 src/gallium/drivers/cell/ppu/cell_spu.h            |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_blend.c    |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_clip.c     |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_derived.c  |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_fs.c       |   8 +-
 .../drivers/cell/ppu/cell_state_rasterizer.c       |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_sampler.c  |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_vertex.c   |   2 +-
 src/gallium/drivers/cell/ppu/cell_surface.c        |   2 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   2 +-
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c  |   6 +-
 src/gallium/drivers/cell/spu/Makefile              |   6 +-
 src/gallium/drivers/cell/spu/spu_exec.c            |   4 +-
 src/gallium/drivers/cell/spu/spu_exec.h            |   2 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h            |   4 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   2 +-
 src/gallium/drivers/cell/spu/spu_render.h          |   2 +-
 src/gallium/drivers/cell/spu/spu_tile.h            |   2 +-
 src/gallium/drivers/cell/spu/spu_util.c            |   4 +-
 src/gallium/drivers/cell/spu/spu_vertex_shader.c   |   6 +-
 src/gallium/drivers/failover/Makefile              |   2 +-
 src/gallium/drivers/i915simple/Makefile            |   2 +-
 src/gallium/drivers/i915simple/i915_context.c      |   2 +-
 src/gallium/drivers/i915simple/i915_context.h      |   2 +-
 .../drivers/i915simple/i915_fpc_translate.c        |   4 +-
 src/gallium/drivers/i915simple/i915_prim_emit.c    |   2 +-
 src/gallium/drivers/i915simple/i915_prim_vbuf.c    |   2 +-
 src/gallium/drivers/i915simple/i915_state.c        |   2 +-
 .../drivers/i915simple/i915_state_derived.c        |   4 +-
 src/gallium/drivers/i915simple/i915_strings.c      |   2 +-
 src/gallium/drivers/i915simple/i915_surface.c      |   2 +-
 src/gallium/drivers/i965simple/Makefile            |   2 +-
 src/gallium/drivers/i965simple/brw_shader_info.c   |   2 +-
 src/gallium/drivers/i965simple/brw_state.c         |   2 +-
 src/gallium/drivers/i965simple/brw_strings.c       |   2 +-
 src/gallium/drivers/i965simple/brw_surface.c       |   2 +-
 src/gallium/drivers/i965simple/brw_vs_emit.c       |   2 +-
 src/gallium/drivers/i965simple/brw_wm_decl.c       |   2 +-
 src/gallium/drivers/i965simple/brw_wm_glsl.c       |   2 +-
 src/gallium/drivers/softpipe/Makefile              |   2 +-
 src/gallium/drivers/softpipe/sp_context.c          |   2 +-
 src/gallium/drivers/softpipe/sp_context.h          |   2 +-
 src/gallium/drivers/softpipe/sp_draw_arrays.c      |   2 +-
 src/gallium/drivers/softpipe/sp_flush.c            |   2 +-
 src/gallium/drivers/softpipe/sp_headers.h          |   2 +-
 src/gallium/drivers/softpipe/sp_prim_setup.c       |   4 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c        |   6 +-
 src/gallium/drivers/softpipe/sp_quad_fs.c          |   2 +-
 src/gallium/drivers/softpipe/sp_query.c            |   2 +-
 src/gallium/drivers/softpipe/sp_state_clip.c       |   2 +-
 src/gallium/drivers/softpipe/sp_state_derived.c    |   6 +-
 src/gallium/drivers/softpipe/sp_state_fs.c         |   8 +-
 src/gallium/drivers/softpipe/sp_state_rasterizer.c |   2 +-
 src/gallium/drivers/softpipe/sp_state_sampler.c    |   4 +-
 src/gallium/drivers/softpipe/sp_state_vertex.c     |   2 +-
 src/gallium/drivers/softpipe/sp_surface.c          |   2 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c       |   2 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c       |   2 +-
 src/gallium/winsys/dri/Makefile                    |  38 +++++++
 src/gallium/winsys/dri/Makefile.template           | 113 +++++++++++++++++++++
 src/gallium/winsys/dri/intel/Makefile              |   8 +-
 src/gallium/winsys/dri/intel/intel_winsys_i915.c   |   2 +-
 .../winsys/dri/intel/intel_winsys_softpipe.c       |   2 +-
 src/gallium/winsys/xlib/xm_winsys.c                |   6 +-
 src/gallium/winsys/xlib/xm_winsys_aub.c            |   2 +-
 src/mesa/Makefile                                  |  16 ++-
 src/mesa/drivers/x11/xm_api.c                      |   2 +-
 src/mesa/drivers/x11/xm_dd.c                       |   2 +-
 src/mesa/drivers/x11/xm_surface.c                  |   8 +-
 src/mesa/drivers/x11/xm_winsys.c                   |   2 +-
 src/mesa/drivers/x11/xmesaP.h                      |   4 +-
 src/mesa/sources                                   |  83 +++++++--------
 src/mesa/state_tracker/st_atom_shader.c            |   2 +-
 src/mesa/state_tracker/st_cache.c                  |   4 +-
 src/mesa/state_tracker/st_cache.h                  |   2 +-
 src/mesa/state_tracker/st_cb_accum.c               |   2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c          |   2 +-
 src/mesa/state_tracker/st_cb_feedback.c            |   6 +-
 src/mesa/state_tracker/st_cb_program.c             |   4 +-
 src/mesa/state_tracker/st_cb_rasterpos.c           |   4 +-
 src/mesa/state_tracker/st_cb_readpixels.c          |   2 +-
 src/mesa/state_tracker/st_cb_texture.c             |   2 +-
 src/mesa/state_tracker/st_context.c                |   4 +-
 src/mesa/state_tracker/st_debug.c                  |   4 +-
 src/mesa/state_tracker/st_draw.c                   |   4 +-
 src/mesa/state_tracker/st_gen_mipmap.c             |   2 +-
 src/mesa/state_tracker/st_mesa_to_tgsi.c           |   6 +-
 src/mesa/state_tracker/st_program.c                |   4 +-
 127 files changed, 445 insertions(+), 241 deletions(-)
 create mode 100644 src/gallium/aux/Makefile
 create mode 100644 src/gallium/drivers/Makefile
 create mode 100644 src/gallium/winsys/dri/Makefile
 create mode 100644 src/gallium/winsys/dri/Makefile.template

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/configs/beos b/configs/beos
index f07973d0c7..2b74af739d 100644
--- a/configs/beos
+++ b/configs/beos
@@ -86,7 +86,7 @@ else
 endif
 
 # Directories
-SRC_DIRS = mesa glu glut/beos
+SRC_DIRS = gallium mesa glu glut/beos
 GLU_DIRS = sgi
 DRIVER_DIRS = beos
 PROGRAM_DIRS = beos samples redbook demos tests
diff --git a/configs/darwin b/configs/darwin
index 7826ecc605..bba7802696 100644
--- a/configs/darwin
+++ b/configs/darwin
@@ -25,5 +25,5 @@ GLW_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXt $(TOP)/lib/GL.dylib
 APP_LIB_DEPS = -L$(TOP)/lib -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm
 
 # omit glw lib for now:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
diff --git a/configs/darwin-x86ppc b/configs/darwin-x86ppc
index 13172327a7..ebeb25051f 100644
--- a/configs/darwin-x86ppc
+++ b/configs/darwin-x86ppc
@@ -29,5 +29,5 @@ GLW_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXt $(TOP)/lib/GL.dylib
 APP_LIB_DEPS = -L$(TOP)/lib -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm
 
 # omit glw lib for now:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
diff --git a/configs/default b/configs/default
index 166205a1d3..25a87e66a1 100644
--- a/configs/default
+++ b/configs/default
@@ -60,7 +60,7 @@ GLW_SOURCES = GLwDrawA.c
 
 # Directories to build
 LIB_DIR = lib
-SRC_DIRS = mesa glu glut/glx glw
+SRC_DIRS = gallium mesa glu glut/glx glw
 GLU_DIRS = sgi
 DRIVER_DIRS = x11 osmesa
 # Which subdirs under $(TOP)/progs/ to enter:
diff --git a/configs/freebsd-dri b/configs/freebsd-dri
index 402883d1de..67d253b869 100644
--- a/configs/freebsd-dri
+++ b/configs/freebsd-dri
@@ -36,7 +36,7 @@ GLW_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -L/usr/X11R6/lib -lGL -lXt -lX11
 
 
 # Directories
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 DRIVER_DIRS = dri
 PROGRAM_DIRS = 
 WINDOW_SYSTEM=dri
diff --git a/configs/linux-cell b/configs/linux-cell
index 3d874491e4..fdf20deeeb 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -21,7 +21,7 @@ CFLAGS = $(OPT_FLAGS) -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(
 CXXFLAGS = $(CFLAGS)
 
 # Omitting glw here:
-SRC_DIRS = mesa glu glut/glx
+SRC_DIRS = gallium mesa glu glut/glx
 
 
 MKDEP_OPTIONS = -fdepend -Y
diff --git a/configs/linux-directfb b/configs/linux-directfb
index 09332f4808..dff27f7850 100644
--- a/configs/linux-directfb
+++ b/configs/linux-directfb
@@ -22,7 +22,7 @@ ifeq ($(HAVE_X86), yes)
 endif
 
 # Directories
-SRC_DIRS     = mesa glu glut/directfb
+SRC_DIRS     = gallium mesa glu glut/directfb
 GLU_DIRS     = sgi
 DRIVER_DIRS  = directfb
 PROGRAM_DIRS = demos directfb
diff --git a/configs/linux-dri b/configs/linux-dri
index 936fce9982..e6135856fc 100644
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -54,10 +54,10 @@ USING_EGL=0
 
 # Directories
 ifeq ($(USING_EGL), 1)
-SRC_DIRS = egl glx/x11 mesa glu glut/glx glw
+SRC_DIRS = egl glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS = egl
 else
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS =
 endif
 
@@ -66,4 +66,4 @@ WINDOW_SYSTEM=dri
 
 # gamma are missing because they have not been converted to use the new
 # interface.
-DRI_DIRS = intel_winsys 
+DRI_DIRS = intel 
diff --git a/configs/linux-dri-xcb b/configs/linux-dri-xcb
index aa292a13ec..ea4bdf1864 100644
--- a/configs/linux-dri-xcb
+++ b/configs/linux-dri-xcb
@@ -53,10 +53,10 @@ USING_EGL=0
 
 # Directories
 ifeq ($(USING_EGL), 1)
-SRC_DIRS = egl glx/x11 mesa glu glut/glx glw
+SRC_DIRS = egl glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS = egl
 else
-SRC_DIRS = glx/x11 mesa glu glut/glx glw
+SRC_DIRS = glx/x11 gallium mesa glu glut/glx glw
 PROGRAM_DIRS =
 endif
 
diff --git a/configs/linux-fbdev b/configs/linux-fbdev
index e36d20a702..1ddccb3f52 100644
--- a/configs/linux-fbdev
+++ b/configs/linux-fbdev
@@ -6,7 +6,7 @@ CONFIG_NAME = linux-fbdev
 
 CFLAGS = -O3 -ffast-math -ansi -pedantic -fPIC -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DPTHREADS -DUSE_GLFBDEV_DRIVER
 
-SRC_DIRS = mesa glu glut/fbdev
+SRC_DIRS = gallium mesa glu glut/fbdev
 DRIVER_DIRS = fbdev osmesa
 PROGRAM_DIRS = fbdev demos redbook samples
 
diff --git a/configs/linux-osmesa b/configs/linux-osmesa
index cc1fbbd109..0382a19553 100644
--- a/configs/linux-osmesa
+++ b/configs/linux-osmesa
@@ -14,7 +14,7 @@ CXXFLAGS = -O3 -ansi -pedantic -fPIC -ffast-math -D_POSIX_SOURCE -D_POSIX_C_SOUR
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = osdemos
 
diff --git a/configs/linux-osmesa16 b/configs/linux-osmesa16
index 1fb0186d31..9a527592f1 100644
--- a/configs/linux-osmesa16
+++ b/configs/linux-osmesa16
@@ -17,7 +17,7 @@ OSMESA_LIB_NAME = libOSMesa16.so
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-osmesa16-static b/configs/linux-osmesa16-static
index 6645504478..1e6380b02e 100644
--- a/configs/linux-osmesa16-static
+++ b/configs/linux-osmesa16-static
@@ -18,7 +18,7 @@ OSMESA_LIB_NAME = libOSMesa16.a
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-osmesa32 b/configs/linux-osmesa32
index a1e5a358d6..f0ef1831b0 100644
--- a/configs/linux-osmesa32
+++ b/configs/linux-osmesa32
@@ -17,7 +17,7 @@ OSMESA_LIB_NAME = libOSMesa32.so
 
 
 # Directories
-SRC_DIRS = mesa glu
+SRC_DIRS = gallium mesa glu
 DRIVER_DIRS = osmesa
 PROGRAM_DIRS = 
 
diff --git a/configs/linux-solo b/configs/linux-solo
index 220fe58b9a..d49b972228 100644
--- a/configs/linux-solo
+++ b/configs/linux-solo
@@ -43,7 +43,7 @@ GLUT_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLU_LIB) -l$(GL_LIB) -lm
 APP_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -lm -lpthread
 
 # Directories
-SRC_DIRS = glx/mini mesa glu glut/mini
+SRC_DIRS = glx/mini gallium mesa glu glut/mini
 DRIVER_DIRS = dri
 PROGRAM_DIRS = miniglx
 
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index d880d090c1..a13b9a52d3 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -1,16 +1,8 @@
-TOP = ../../..
+TOP = ../..
 include $(TOP)/configs/current
 
 
-ifeq ($(CONFIG_NAME), linux-cell)
-CELL_DIR = cell
-endif
-
-ifeq ($(CONFIG_NAME), linux-llvm)
-LLVM_DIR = llvm
-endif
-
-SUBDIRS = softpipe i915simple i965simple failover pipebuffer $(CELL_DIR) $(LLVM_DIR)
+SUBDIRS = aux drivers
 
 
 default: subdirs
diff --git a/src/gallium/Makefile.template b/src/gallium/Makefile.template
index 8e84f8eb2d..0717ed8dd2 100644
--- a/src/gallium/Makefile.template
+++ b/src/gallium/Makefile.template
@@ -15,7 +15,10 @@ OBJECTS = $(C_SOURCES:.c=.o) \
 ### Include directories
 INCLUDES = \
 	-I. \
-	-I$(TOP)/src/mesa/pipe \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/include/pipe \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include \
         $(DRIVER_INCLUDES)
@@ -38,7 +41,7 @@ INCLUDES = \
 default: depend symlinks $(LIBNAME)
 
 
-$(LIBNAME): $(OBJECTS) Makefile $(TOP)/src/mesa/pipe/Makefile.template
+$(LIBNAME): $(OBJECTS) Makefile $(TOP)/src/gallium/Makefile.template
 	$(TOP)/bin/mklib -o $@ -static $(OBJECTS) $(DRIVER_LIBS)
 
 
diff --git a/src/gallium/aux/Makefile b/src/gallium/aux/Makefile
new file mode 100644
index 0000000000..da68498aa1
--- /dev/null
+++ b/src/gallium/aux/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+ifeq ($(CONFIG_NAME), linux-llvm)
+LLVM_DIR = llvm
+endif
+
+SUBDIRS = pipebuffer $(LLVM_DIR)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/aux/draw/draw_private.h b/src/gallium/aux/draw/draw_private.h
index b17eaaed65..3d09aef87c 100644
--- a/src/gallium/aux/draw/draw_private.h
+++ b/src/gallium/aux/draw/draw_private.h
@@ -45,7 +45,7 @@
 #include "pipe/p_defines.h"
 
 #include "x86/rtasm/x86sse.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 
 struct gallivm_prog;
diff --git a/src/gallium/aux/draw/draw_vertex.c b/src/gallium/aux/draw/draw_vertex.c
index 2d6592150f..daf1ef4b80 100644
--- a/src/gallium/aux/draw/draw_vertex.c
+++ b/src/gallium/aux/draw/draw_vertex.c
@@ -34,8 +34,8 @@
  */
 
 
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
 
 
 /**
diff --git a/src/gallium/aux/draw/draw_vertex_shader.c b/src/gallium/aux/draw/draw_vertex_shader.c
index c824c1407e..377ecbb931 100644
--- a/src/gallium/aux/draw/draw_vertex_shader.c
+++ b/src/gallium/aux/draw/draw_vertex_shader.c
@@ -34,13 +34,13 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #if defined(__i386__) || defined(__386__)
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "tgsi/exec/tgsi_sse2.h"
 #endif
 #include "draw_private.h"
 #include "draw_context.h"
 
 #include "x86/rtasm/x86sse.h"
-#include "pipe/llvm/gallivm.h"
+#include "llvm/gallivm.h"
 
 
 #define DBG_VS 0
diff --git a/src/gallium/aux/llvm/Makefile b/src/gallium/aux/llvm/Makefile
index 9c6e16d86b..e6ac399d08 100644
--- a/src/gallium/aux/llvm/Makefile
+++ b/src/gallium/aux/llvm/Makefile
@@ -30,7 +30,9 @@ OBJECTS = $(C_SOURCES:.c=.o) \
 ### Include directories
 INCLUDES = \
 	-I. \
-	-I$(TOP)/src/mesa/pipe \
+	-I$(TOP)/src/gallium/drivers
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/include
 
diff --git a/src/gallium/aux/llvm/gallivm.cpp b/src/gallium/aux/llvm/gallivm.cpp
index da0105c2c9..d14bb3b99a 100644
--- a/src/gallium/aux/llvm/gallivm.cpp
+++ b/src/gallium/aux/llvm/gallivm.cpp
@@ -42,8 +42,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/aux/llvm/gallivm_cpu.cpp b/src/gallium/aux/llvm/gallivm_cpu.cpp
index dc4d92a72a..8f9830d0b1 100644
--- a/src/gallium/aux/llvm/gallivm_cpu.cpp
+++ b/src/gallium/aux/llvm/gallivm_cpu.cpp
@@ -42,8 +42,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/aux/llvm/tgsitollvm.cpp b/src/gallium/aux/llvm/tgsitollvm.cpp
index 0de595e678..2cb4acce32 100644
--- a/src/gallium/aux/llvm/tgsitollvm.cpp
+++ b/src/gallium/aux/llvm/tgsitollvm.cpp
@@ -10,11 +10,11 @@
 
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
-#include "pipe/tgsi/util/tgsi_util.h"
-#include "pipe/tgsi/util/tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_dump.h"
 
 
 #include <llvm/Module.h>
diff --git a/src/gallium/aux/pipebuffer/Makefile b/src/gallium/aux/pipebuffer/Makefile
index 75764a9a18..588629e870 100644
--- a/src/gallium/aux/pipebuffer/Makefile
+++ b/src/gallium/aux/pipebuffer/Makefile
@@ -17,7 +17,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/aux/tgsi/exec/tgsi_exec.c b/src/gallium/aux/tgsi/exec/tgsi_exec.c
index 37e6007068..a8f64c2287 100644
--- a/src/gallium/aux/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/aux/tgsi/exec/tgsi_exec.c
@@ -54,8 +54,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "tgsi_exec.h"
 
 #define TILE_TOP_LEFT     0
diff --git a/src/gallium/aux/tgsi/exec/tgsi_sse2.c b/src/gallium/aux/tgsi/exec/tgsi_sse2.c
index 1e56e4afb6..593464db3e 100755
--- a/src/gallium/aux/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/aux/tgsi/exec/tgsi_sse2.c
@@ -27,8 +27,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "tgsi_exec.h"
 #include "tgsi_sse2.h"
 
diff --git a/src/gallium/aux/tgsi/util/tgsi_transform.h b/src/gallium/aux/tgsi/util/tgsi_transform.h
index 365d8c298c..fcf85d603b 100644
--- a/src/gallium/aux/tgsi/util/tgsi_transform.h
+++ b/src/gallium/aux/tgsi/util/tgsi_transform.h
@@ -31,8 +31,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_build.h"
 
 
diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile
new file mode 100644
index 0000000000..c0345a9cb5
--- /dev/null
+++ b/src/gallium/drivers/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+ifeq ($(CONFIG_NAME), linux-cell)
+CELL_DIR = cell
+endif
+
+SUBDIRS = softpipe i915simple i965simple failover pipebuffer $(CELL_DIR)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 50060f5cd3..011863c11e 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -40,8 +40,11 @@ SOURCES = \
 
 OBJECTS = $(SOURCES:.c=.o) \
 
-INCLUDE_DIRS = -I$(TOP)/src/mesa
-
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers
 
 .c.o:
 	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 07b908eec5..e588a30d5b 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -35,7 +35,7 @@
 #include <stdint.h>
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_batch.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index bbe1fd7a11..e1eb22f468 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -37,9 +37,9 @@
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 3b63419b5e..6196c0c72f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -32,10 +32,10 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/draw/draw_vertex.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_vbuf.h"
 #include "cell_winsys.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
 struct cell_vbuf_render;
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 717cd8370f..f12613649b 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -39,7 +39,7 @@
 #include "cell_draw_arrays.h"
 #include "cell_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index f62bc4650c..20f27531fc 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -31,7 +31,7 @@
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_render.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index 4ab277a4b2..b663b37622 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -34,7 +34,7 @@
 #include "cell_render.h"
 #include "cell_spu.h"
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_private.h"
 
 
 struct render_stage {
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 7c83a47e57..419e74dc40 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -31,7 +31,7 @@
 #include "cell_spu.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
 /*
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 19eff94f96..137f26612e 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -31,7 +31,7 @@
 
 #include <libspe2.h>
 #include <libmisc.h>
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 #include "cell_context.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_blend.c b/src/gallium/drivers/cell/ppu/cell_state_blend.c
index 4fc60548c8..b6d6d71f0c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_blend.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_blend.c
@@ -29,7 +29,7 @@
  */
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_clip.c b/src/gallium/drivers/cell/ppu/cell_state_clip.c
index 4f43665941..0482f87e88 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_clip.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_clip.c
@@ -30,7 +30,7 @@
 
 #include "cell_context.h"
 #include "cell_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void cell_set_clip_state( struct pipe_context *pipe,
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 56daf5dfde..0c46829258 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -27,8 +27,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
 #include "cell_context.h"
 #include "cell_batch.h"
 #include "cell_state.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_fs.c b/src/gallium/drivers/cell/ppu/cell_state_fs.c
index 3f46a87d18..b2ed699a5b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_fs.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_fs.c
@@ -29,12 +29,12 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #if 0
 #include "pipe/p_shader_tokens.h"
-#include "pipe/llvm/gallivm.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "llvm/gallivm.h"
+#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_sse2.h"
 #endif
 
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c b/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
index d8128ece54..7eca5b5765 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_rasterizer.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_sampler.c b/src/gallium/drivers/cell/ppu/cell_state_sampler.c
index ade6cc8338..a33421a4ad 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_sampler.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_sampler.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_texture.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 0f01e920f9..563831b62d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -32,7 +32,7 @@
 #include "cell_context.h"
 #include "cell_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index fca93e4742..a35db0ef99 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "cell_context.h"
 #include "cell_surface.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index e9fafe492e..cc727ff4ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -36,7 +36,7 @@
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vbuf.h"
 
 
 /** Allow vertex data to be inlined after RENDER command */
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 80dd500b34..0ba4506505 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -38,9 +38,9 @@
 #include "cell_spu.h"
 #include "cell_batch.h"
 
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 
 /**
  * Run the vertex shader on all vertices in the vertex queue.
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index f202971d73..7aa947299e 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -31,7 +31,11 @@ SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
 
-INCLUDE_DIRS = -I$(TOP)/src/mesa
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers
 
 
 .c.o:
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index e51008b9b3..109540b1f7 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -67,8 +67,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index b4c7661ef6..3e17c490d2 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -29,7 +29,7 @@
 #define SPU_EXEC_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 #if defined __cplusplus
 extern "C" {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index e375197fe6..1e7243b863 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -38,7 +38,7 @@
 #include "spu_tile.h"
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 #include "pipe/p_defines.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1710a17512..5c95d112ac 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -31,8 +31,8 @@
 
 #include <spu_mfcio.h>
 
-#include "pipe/cell/common.h"
-#include "pipe/draw/draw_vertex.h"
+#include "cell/common.h"
+#include "draw/draw_vertex.h"
 #include "pipe/p_state.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 932fb500b3..20e77aa2e6 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -34,7 +34,7 @@
 #include "spu_render.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h
index fbcdc5ec31..493434f087 100644
--- a/src/gallium/drivers/cell/spu/spu_render.h
+++ b/src/gallium/drivers/cell/spu/spu_render.h
@@ -29,7 +29,7 @@
 #ifndef SPU_RENDER_H
 #define SPU_RENDER_H
 
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 extern void
 cmd_render(const struct cell_command_render *render, uint *pos_incr);
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index e53340a55a..3105b848fd 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -32,7 +32,7 @@
 #include <libmisc.h>
 #include <spu_mfcio.h>
 #include "spu_main.h"
-#include "pipe/cell/common.h"
+#include "cell/common.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index ac373240c1..ea4274a0a7 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,8 +1,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 //#include "tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_util.h"
 
 unsigned
 tgsi_util_get_src_register_swizzle(
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index c1cbbb6d1e..3f5bf41aa2 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -39,9 +39,9 @@
 #include "pipe/p_shader_tokens.h"
 #include "spu_vertex_shader.h"
 #include "spu_exec.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/cell/common.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "cell/common.h"
 #include "spu_main.h"
 
 static INLINE unsigned
diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile
index 72d0895c74..14389bd055 100644
--- a/src/gallium/drivers/failover/Makefile
+++ b/src/gallium/drivers/failover/Makefile
@@ -15,7 +15,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile
index 2f91de3afc..ee22ba86f9 100644
--- a/src/gallium/drivers/i915simple/Makefile
+++ b/src/gallium/drivers/i915simple/Makefile
@@ -32,7 +32,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index 497623a700..7f71f8fd4f 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -32,7 +32,7 @@
 #include "i915_texture.h"
 #include "i915_reg.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h
index b4ea63c3e7..2d876925b2 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915simple/i915_context.h
@@ -33,7 +33,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 
 #define I915_TEX_UNITS 8
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 868f0c7e04..6c1524c768 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -33,9 +33,9 @@
 #include "i915_fpc.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 
 /**
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index c4a706c37d..44c4325936 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_private.h"
 #include "pipe/p_util.h"
 
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index e069773fd4..c5bf6174f6 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -38,7 +38,7 @@
  */
 
 
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index abd5571b88..294e6fad03 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -29,7 +29,7 @@
  */
 
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 653983e4a9..4767584fc6 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -27,8 +27,8 @@
 
 
 #include "pipe/p_util.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
 #include "i915_context.h"
 #include "i915_state.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_strings.c b/src/gallium/drivers/i915simple/i915_strings.c
index c713bf7208..301fedea19 100644
--- a/src/gallium/drivers/i915simple/i915_strings.c
+++ b/src/gallium/drivers/i915simple/i915_strings.c
@@ -70,7 +70,7 @@ static const char *i915_get_name( struct pipe_context *pipe )
       break;
    }
 
-   sprintf(buffer, "pipe/i915 (chipset: %s)", chipset);
+   sprintf(buffer, "i915 (chipset: %s)", chipset);
    return buffer;
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c
index de0cc5fe06..17fd27895a 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915simple/i915_surface.c
@@ -33,7 +33,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 /*
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
index 48c00ab50b..1dec1f9749 100644
--- a/src/gallium/drivers/i965simple/Makefile
+++ b/src/gallium/drivers/i965simple/Makefile
@@ -61,6 +61,6 @@ ASM_SOURCES =
 
 DRIVER_DEFINES = -I.
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
index 431b45466a..a762a870fe 100644
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -3,7 +3,7 @@
 #include "brw_state.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index 95dfce88e4..f746d1cc57 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -33,7 +33,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include "brw_context.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965simple/brw_strings.c b/src/gallium/drivers/i965simple/brw_strings.c
index 29a41ed1e9..3d9c50961f 100644
--- a/src/gallium/drivers/i965simple/brw_strings.c
+++ b/src/gallium/drivers/i965simple/brw_strings.c
@@ -59,7 +59,7 @@ static const char *brw_get_name( struct pipe_context *pipe )
       break;
    }
 
-   sprintf(buffer, "pipe/i965 (chipset: %s)", chipset);
+   sprintf(buffer, "i965 (chipset: %s)", chipset);
    return buffer;
 }
 
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
index 518845e4b2..376a42b1a6 100644
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -32,7 +32,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 /*
diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c
index 98915ba101..05df4860ed 100644
--- a/src/gallium/drivers/i965simple/brw_vs_emit.c
+++ b/src/gallium/drivers/i965simple/brw_vs_emit.c
@@ -33,7 +33,7 @@
 #include "brw_vs.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 struct brw_prog_info {
    unsigned num_temps;
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
index b45a333a2e..97418a52e7 100644
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index d95645d108..44f946ea74 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -4,7 +4,7 @@
 #include "brw_wm.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_parse.h"
 
 
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 31438a882e..2304ea4246 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -44,7 +44,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../../Makefile.template
 
 symlinks:
 
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index cea6b90104..5e98f190bb 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -29,7 +29,7 @@
  *    Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index aff8c2cc5d..8c79cb3ce4 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -34,7 +34,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_vertex.h"
 
 #include "sp_quad.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 71a303a8b5..2049afda34 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -38,7 +38,7 @@
 #include "sp_context.h"
 #include "sp_state.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index ced0d5d098..2cbd0d7cab 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -31,7 +31,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "sp_flush.h"
 #include "sp_context.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h
index 0ae31d8796..9cf8222133 100644
--- a/src/gallium/drivers/softpipe/sp_headers.h
+++ b/src/gallium/drivers/softpipe/sp_headers.h
@@ -31,7 +31,7 @@
 #ifndef SP_HEADERS_H
 #define SP_HEADERS_H
 
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 #define PRIM_POINT 1
 #define PRIM_LINE  2
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
index 2772048661..d73521ccbe 100644
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ b/src/gallium/drivers/softpipe/sp_prim_setup.c
@@ -38,8 +38,8 @@
 #include "sp_quad.h"
 #include "sp_state.h"
 #include "sp_prim_setup.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 7f71fdb6a9..69bea8a8f5 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -39,9 +39,9 @@
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vbuf.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 3316858413..b4c01a7ea8 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -42,7 +42,7 @@
 #include "x86/rtasm/x86sse.h"
 
 #ifdef MESA_LLVM
-#include "pipe/llvm/gallivm.h"
+#include "llvm/gallivm.h"
 #endif
 
 #include "sp_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 6a8a43aeda..adf9ccf64c 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -29,7 +29,7 @@
  *    Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_util.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c
index 08c5f06d05..c797c0dd3b 100644
--- a/src/gallium/drivers/softpipe/sp_state_clip.c
+++ b/src/gallium/drivers/softpipe/sp_state_clip.c
@@ -29,7 +29,7 @@
  */
 #include "sp_context.h"
 #include "sp_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void softpipe_set_clip_state( struct pipe_context *pipe,
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 372597869f..9d8fd8b750 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -27,9 +27,9 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_vertex.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_private.h"
 #include "sp_context.h"
 #include "sp_state.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 0b814fc284..1e3cadd43d 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -32,11 +32,11 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/llvm/gallivm.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
-#include "pipe/tgsi/exec/tgsi_sse2.h"
+#include "llvm/gallivm.h"
+#include "tgsi/util/tgsi_dump.h"
+#include "tgsi/exec/tgsi_sse2.h"
 
 
 void *
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 53755099dd..98e04352db 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "sp_context.h"
 #include "sp_state.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index ea348c7e95..460adccec4 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -31,14 +31,14 @@
 
 #include "pipe/p_util.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
 #include "sp_tile_cache.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
index 09ff540ccf..f01a10de3b 100644
--- a/src/gallium/drivers/softpipe/sp_state_vertex.c
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -33,7 +33,7 @@
 #include "sp_state.h"
 #include "sp_surface.h"
 
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_context.h"
 
 
 void
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index 8802ced187..653449c4f1 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -29,7 +29,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 325bdb86da..2f82fd6abe 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -40,7 +40,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
-#include "pipe/tgsi/exec/tgsi_exec.h"
+#include "tgsi/exec/tgsi_exec.h"
 
 
 /*
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 1597361b82..dde3fabc81 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
diff --git a/src/gallium/winsys/dri/Makefile b/src/gallium/winsys/dri/Makefile
new file mode 100644
index 0000000000..f466ce6c3c
--- /dev/null
+++ b/src/gallium/winsys/dri/Makefile
@@ -0,0 +1,38 @@
+# src/mesa/drivers/dri/Makefile
+
+TOP = ../../../..
+
+include $(TOP)/configs/current
+
+
+
+default: $(TOP)/$(LIB_DIR) subdirs
+
+
+$(TOP)/$(LIB_DIR):
+	-mkdir $(TOP)/$(LIB_DIR)
+
+
+subdirs:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+install:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) install) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	@for dir in $(DRI_DIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) clean) ; \
+		fi \
+	done
+	-rm -f common/*.o
diff --git a/src/gallium/winsys/dri/Makefile.template b/src/gallium/winsys/dri/Makefile.template
new file mode 100644
index 0000000000..b96305c094
--- /dev/null
+++ b/src/gallium/winsys/dri/Makefile.template
@@ -0,0 +1,113 @@
+# -*-makefile-*-
+
+MESA_MODULES = $(TOP)/src/mesa/libmesa.a
+
+COMMON_GALLIUM_SOURCES = \
+        $(TOP)/src/mesa/drivers/dri/common/utils.c \
+        $(TOP)/src/mesa/drivers/dri/common/vblank.c \
+        $(TOP)/src/mesa/drivers/dri/common/dri_util.c \
+        $(TOP)/src/mesa/drivers/dri/common/xmlconfig.c
+
+COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
+        $(TOP)/src/mesa/drivers/common/driverfuncs.c \
+        $(TOP)/src/mesa/drivers/dri/common/texmem.c \
+        $(TOP)/src/mesa/drivers/dri/common/drirenderbuffer.c
+
+COMMON_BM_SOURCES = \
+	$(TOP)/src/mesa/drivers/dri/common/dri_bufmgr.c \
+	$(TOP)/src/mesa/drivers/dri/common/dri_drmpool.c
+
+
+ifeq ($(WINDOW_SYSTEM),dri)
+WINOBJ=
+WINLIB=
+INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+
+else
+# miniglx
+WINOBJ=
+WINLIB=-L$(MESA)/src/glx/mini
+MINIGLX_INCLUDES = -I$(TOP)/src/glx/mini
+INCLUDES = $(MINIGLX_INCLUDES) \
+	   $(SHARED_INCLUDES) \
+	   $(PCIACCESS_CFLAGS)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(MINIGLX_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+endif
+
+
+### Include directories
+SHARED_INCLUDES = \
+	-I. \
+	-I$(TOP)/src/mesa/drivers/dri/common \
+	-Iserver \
+	-I$(TOP)/include \
+	-I$(TOP)/include/GL/internal \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/aux \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/mesa/glapi \
+	-I$(TOP)/src/mesa/math \
+	-I$(TOP)/src/mesa/transform \
+	-I$(TOP)/src/mesa/shader \
+	-I$(TOP)/src/mesa/swrast \
+	-I$(TOP)/src/mesa/swrast_setup \
+	-I$(TOP)/src/egl/main \
+	-I$(TOP)/src/egl/drivers/dri \
+	$(LIBDRM_CFLAGS)
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES)  $< -o $@
+
+
+##### TARGETS #####
+
+default: depend symlinks $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
+
+
+$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.template
+	$(TOP)/bin/mklib -noprefix -o $@ \
+		$(OBJECTS) $(PIPE_DRIVERS) $(MESA_MODULES)  $(WINOBJ) $(DRI_LIB_DEPS)
+
+
+$(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
+	$(INSTALL) $(LIBNAME) $(TOP)/$(LIB_DIR) 
+
+
+depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		$(ASM_SOURCES) 2> /dev/null
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+# Remove .o and backup files
+clean:
+	-rm -f *.o */*.o *~ *.so *~ server/*.o $(SYMLINKS)
+	-rm -f depend depend.bak
+
+
+install: $(LIBNAME)
+	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
+	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
+
+
+include depend
diff --git a/src/gallium/winsys/dri/intel/Makefile b/src/gallium/winsys/dri/intel/Makefile
index 9ae0f01325..40654bb2ac 100644
--- a/src/gallium/winsys/dri/intel/Makefile
+++ b/src/gallium/winsys/dri/intel/Makefile
@@ -7,8 +7,8 @@ LIBNAME = i915tex_dri.so
 MINIGLX_SOURCES = server/intel_dri.c
 
 PIPE_DRIVERS = \
-	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
-	$(TOP)/src/mesa/pipe/i915simple/libi915simple.a
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/i915simple/libi915simple.a
 
 DRIVER_SOURCES = \
 	intel_winsys_pipe.c \
@@ -28,11 +28,11 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-DRIVER_DEFINES = -I../intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
+DRIVER_DEFINES = -I$(TOP)/src/mesa/drivers/dri/intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
 				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
 
 include ../Makefile.template
 
-intel_tex_layout.o: ../intel/intel_tex_layout.c
+intel_tex_layout.o: $(TOP)/src/mesa/drivers/dri/intel/intel_tex_layout.c
 
 symlinks:
diff --git a/src/gallium/winsys/dri/intel/intel_winsys_i915.c b/src/gallium/winsys/dri/intel/intel_winsys_i915.c
index 1ba6a9e1b2..0ed3890e93 100644
--- a/src/gallium/winsys/dri/intel/intel_winsys_i915.c
+++ b/src/gallium/winsys/dri/intel/intel_winsys_i915.c
@@ -39,7 +39,7 @@
 #include "intel_winsys.h"
 
 #include "pipe/p_util.h"
-#include "pipe/i915simple/i915_winsys.h"
+#include "i915simple/i915_winsys.h"
 
 
 struct intel_i915_winsys {
diff --git a/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c b/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
index cec3437831..9e483bdc9f 100644
--- a/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
+++ b/src/gallium/winsys/dri/intel/intel_winsys_softpipe.c
@@ -34,7 +34,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_util.h"
 #include "pipe/p_format.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 
 struct intel_softpipe_winsys {
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index c3cd22eea3..8da596d419 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -41,11 +41,11 @@
 #include "pipe/p_context.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 #ifdef GALLIUM_CELL
-#include "pipe/cell/ppu/cell_context.h"
-#include "pipe/cell/ppu/cell_winsys.h"
+#include "cell/ppu/cell_context.h"
+#include "cell/ppu/cell_winsys.h"
 #else
 #define TILE_SIZE 32  /* avoid compilation errors */
 #endif
diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c
index bf41570257..dbfd37bda2 100644
--- a/src/gallium/winsys/xlib/xm_winsys_aub.c
+++ b/src/gallium/winsys/xlib/xm_winsys_aub.c
@@ -39,7 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "pipe/i965simple/brw_winsys.h"
+#include "i965simple/brw_winsys.h"
 #include "brw_aub.h"
 #include "xm_winsys_aub.h"
 
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 720f1b2e02..561608fedd 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -12,16 +12,16 @@ GL_TINY = 0$(MESA_MAJOR)0$(MESA_MINOR)0$(MESA_TINY)
 
 
 PIPE_LIB = \
-	$(TOP)/src/mesa/pipe/softpipe/libsoftpipe.a \
-	$(TOP)/src/mesa/pipe/i965simple/libi965simple.a
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/i965simple/libi965simple.a
 
 ifeq ($(CONFIG_NAME), linux-cell)
-CELL_LIB = $(TOP)/src/mesa/pipe/cell/ppu/libcell.a
-CELL_LIB_SPU = $(TOP)/src/mesa/pipe/cell/spu/g3d_spu.a
+CELL_LIB = $(TOP)/src/gallium/drivers/cell/ppu/libcell.a
+CELL_LIB_SPU = $(TOP)/src/gallium/drivers/cell/spu/g3d_spu.a
 endif
 
 ifeq ($(CONFIG_NAME), linux-llvm)
-LLVM_LIB = $(TOP)/src/mesa/pipe/llvm/libgallivm.a
+LLVM_LIB = $(TOP)/src/gallium/aux/llvm/libgallivm.a
 endif
 
 .SUFFIXES : .cpp
@@ -71,7 +71,7 @@ libmesa.a: $(SOLO_OBJECTS)
 	fi
 
 linux-solo: depend subdirs libmesa.a
-	cd drivers/dri ; $(MAKE)
+	cd $(TOP)/src/gallium/winsys/dri ; $(MAKE)
 
 
 #####################################################################
@@ -165,7 +165,6 @@ depend: $(ALL_SOURCES)
 subdirs:
 	@ (cd x86 ; $(MAKE))
 	@ (cd x86-64 ; $(MAKE))
-	(cd pipe ; $(MAKE))
 
 install: default
 	$(INSTALL) -d $(INSTALL_DIR)/include/GL
@@ -178,7 +177,7 @@ install: default
 		$(INSTALL) $(TOP)/$(LIB_DIR)/libOSMesa* $(INSTALL_DIR)/$(LIB_DIR); \
 	fi
 	@if [ "${DRIVER_DIRS}" = "dri" ] ; then \
-		cd drivers/dri ; $(MAKE) install ; \
+		cd $(TOP)/gallium/winsys/dri ; $(MAKE) install ; \
 	fi
 
 ## NOT INSTALLED YET:
@@ -198,7 +197,6 @@ clean:
 	(cd drivers/dri && $(MAKE) clean)
 	(cd x86 && $(MAKE) clean)
 	(cd x86-64 && $(MAKE) clean)
-	(cd pipe ; $(MAKE) clean )
 
 
 include depend
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 08c98eab48..18b033666f 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -85,7 +85,7 @@
 
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
-#include "pipe/softpipe/sp_context.h"
+#include "softpipe/sp_context.h"
 #include "pipe/p_defines.h"
 
 /**
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 8ae243ae66..34287effe1 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -53,7 +53,7 @@
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 
-#include "pipe/softpipe/sp_context.h"
+#include "softpipe/sp_context.h"
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
 #include "state_tracker/st_draw.h"
diff --git a/src/mesa/drivers/x11/xm_surface.c b/src/mesa/drivers/x11/xm_surface.c
index 5533158ece..81616b92d9 100644
--- a/src/mesa/drivers/x11/xm_surface.c
+++ b/src/mesa/drivers/x11/xm_surface.c
@@ -45,10 +45,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/softpipe/sp_context.h"
-#include "pipe/softpipe/sp_clear.h"
-#include "pipe/softpipe/sp_tile_cache.h"
-#include "pipe/softpipe/sp_surface.h"
+#include "softpipe/sp_context.h"
+#include "softpipe/sp_clear.h"
+#include "softpipe/sp_tile_cache.h"
+#include "softpipe/sp_surface.h"
 #include "state_tracker/st_context.h"
 
 
diff --git a/src/mesa/drivers/x11/xm_winsys.c b/src/mesa/drivers/x11/xm_winsys.c
index a690df2772..2edc697693 100644
--- a/src/mesa/drivers/x11/xm_winsys.c
+++ b/src/mesa/drivers/x11/xm_winsys.c
@@ -38,7 +38,7 @@
 #include "main/macros.h"
 
 #include "pipe/p_winsys.h"
-#include "pipe/softpipe/sp_winsys.h"
+#include "softpipe/sp_winsys.h"
 
 
 /**
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 4709d63394..fd2dfcd79a 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -37,8 +37,8 @@
 #include "xm_image.h"
 #endif
 #include "state_tracker/st_cb_fbo.h"
-#include "pipe/softpipe/sp_context.h"
-#include "pipe/softpipe/sp_surface.h"
+#include "softpipe/sp_context.h"
+#include "softpipe/sp_surface.h"
 
 
 extern _glthread_Mutex _xmesa_lock;
diff --git a/src/mesa/sources b/src/mesa/sources
index 1165425183..2d07738210 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -158,45 +158,45 @@ VF_SOURCES = \
 
 
 DRAW_SOURCES = \
-	pipe/draw/draw_clip.c \
-	pipe/draw/draw_context.c\
-	pipe/draw/draw_cull.c \
-	pipe/draw/draw_debug.c \
-	pipe/draw/draw_flatshade.c \
-	pipe/draw/draw_offset.c \
-	pipe/draw/draw_prim.c \
-	pipe/draw/draw_stipple.c \
-	pipe/draw/draw_twoside.c \
-	pipe/draw/draw_unfilled.c \
-	pipe/draw/draw_validate.c \
-	pipe/draw/draw_vbuf.c \
-	pipe/draw/draw_vertex.c \
-	pipe/draw/draw_vertex_cache.c \
-	pipe/draw/draw_vertex_fetch.c \
-	pipe/draw/draw_vertex_shader.c \
-	pipe/draw/draw_vf.c \
-	pipe/draw/draw_vf_generic.c \
-	pipe/draw/draw_vf_sse.c \
-	pipe/draw/draw_wide_prims.c
+	$(TOP)/src/gallium/aux/draw/draw_clip.c \
+	$(TOP)/src/gallium/aux/draw/draw_context.c\
+	$(TOP)/src/gallium/aux/draw/draw_cull.c \
+	$(TOP)/src/gallium/aux/draw/draw_debug.c \
+	$(TOP)/src/gallium/aux/draw/draw_flatshade.c \
+	$(TOP)/src/gallium/aux/draw/draw_offset.c \
+	$(TOP)/src/gallium/aux/draw/draw_prim.c \
+	$(TOP)/src/gallium/aux/draw/draw_stipple.c \
+	$(TOP)/src/gallium/aux/draw/draw_twoside.c \
+	$(TOP)/src/gallium/aux/draw/draw_unfilled.c \
+	$(TOP)/src/gallium/aux/draw/draw_validate.c \
+	$(TOP)/src/gallium/aux/draw/draw_vbuf.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_cache.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_fetch.c \
+	$(TOP)/src/gallium/aux/draw/draw_vertex_shader.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf_generic.c \
+	$(TOP)/src/gallium/aux/draw/draw_vf_sse.c \
+	$(TOP)/src/gallium/aux/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
-	pipe/tgsi/exec/tgsi_exec.c \
-	pipe/tgsi/exec/tgsi_sse2.c
+	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_exec.c \
+	$(TOP)/src/gallium/aux/tgsi/exec/tgsi_sse2.c
 
 TGSIUTIL_SOURCES = \
-	pipe/tgsi/util/tgsi_build.c \
-	pipe/tgsi/util/tgsi_dump.c \
-	pipe/tgsi/util/tgsi_parse.c \
-	pipe/tgsi/util/tgsi_util.c
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_build.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_dump.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_parse.c \
+	$(TOP)/src/gallium/aux/tgsi/util/tgsi_util.c
 
 STATECACHE_SOURCES = \
-	pipe/cso_cache/cso_hash.c \
-	pipe/cso_cache/cso_cache.c
+	$(TOP)/src/gallium/aux/cso_cache/cso_hash.c \
+	$(TOP)/src/gallium/aux/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
-	pipe/util/p_debug.c \
-	pipe/util/p_tile.c \
-	pipe/util/p_util.c
+	$(TOP)/src/gallium/aux/util/p_debug.c \
+	$(TOP)/src/gallium/aux/util/p_tile.c \
+	$(TOP)/src/gallium/aux/util/p_util.c
 
 STATETRACKER_SOURCES = \
 	state_tracker/st_atom.c \
@@ -331,13 +331,13 @@ __COMMON_DRIVER_SOURCES =			\
 	drivers/common/driverfuncs.c
 
 X11_DRIVER_SOURCES =		\
-	pipe/xlib/glxapi.c	\
-	pipe/xlib/fakeglx.c	\
-	pipe/xlib/xfonts.c	\
-	pipe/xlib/xm_api.c	\
-	pipe/xlib/xm_winsys.c	\
-	pipe/xlib/xm_winsys_aub.c	\
-	pipe/xlib/brw_aub.c
+	$(TOP)/src/gallium/winsys/xlib/glxapi.c	\
+	$(TOP)/src/gallium/winsys/xlib/fakeglx.c	\
+	$(TOP)/src/gallium/winsys/xlib/xfonts.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_api.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_winsys.c	\
+	$(TOP)/src/gallium/winsys/xlib/xm_winsys_aub.c	\
+	$(TOP)/src/gallium/winsys/xlib/brw_aub.c
 
 OSMESA_DRIVER_SOURCES = \
 	drivers/osmesa/osmesa.c
@@ -425,7 +425,10 @@ FBDEV_DRIVER_OBJECTS = $(FBDEV_DRIVER_SOURCES:.c=.o)
 INCLUDE_DIRS = \
 	-I$(TOP)/include \
 	-I$(TOP)/src/mesa \
-	-I$(TOP)/src/mesa/main
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/aux
 
 OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/tnl \
@@ -435,4 +438,4 @@ OLD_INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa/shader \
 	-I$(TOP)/src/mesa/shader/grammar \
 	-I$(TOP)/src/mesa/shader/slang \
-	-I$(TOP)/src/mesa/pipe/tgsi
+	-I$(TOP)/s$(TOP)/src/gallium/aux/tgsi
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 2c6ec8421b..b67b620eaa 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -43,7 +43,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_cache.h"
diff --git a/src/mesa/state_tracker/st_cache.c b/src/mesa/state_tracker/st_cache.c
index e0965b217a..2979e7fae5 100644
--- a/src/mesa/state_tracker/st_cache.c
+++ b/src/mesa/state_tracker/st_cache.c
@@ -36,8 +36,8 @@
 
 #include "pipe/p_state.h"
 
-#include "pipe/cso_cache/cso_cache.h"
-#include "pipe/cso_cache/cso_hash.h"
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
 
 
 /* Those function will either find the state of the given template
diff --git a/src/mesa/state_tracker/st_cache.h b/src/mesa/state_tracker/st_cache.h
index e0c176b0ff..b81de316ec 100644
--- a/src/mesa/state_tracker/st_cache.h
+++ b/src/mesa/state_tracker/st_cache.h
@@ -33,7 +33,7 @@
 #ifndef ST_CACHE_H
 #define ST_CACHE_H
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 struct pipe_blend_state;
 struct pipe_sampler_state;
diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c
index 3a3bf9016d..663c4f205d 100644
--- a/src/mesa/state_tracker/st_cb_accum.c
+++ b/src/mesa/state_tracker/st_cb_accum.c
@@ -43,7 +43,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 #define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index f13199a3c0..e2d4e06da1 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -56,7 +56,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "shader/prog_instruction.h"
 
 
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index 31744151f1..5315294c07 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -53,10 +53,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 
 
 /**
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index af3ee65504..61d4f4c41c 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -39,8 +39,8 @@
 #include "shader/programopt.h"
 #include "shader/shader_api.h"
 
-#include "pipe/cso_cache/cso_cache.h"
-#include "pipe/draw/draw_context.h"
+#include "cso_cache/cso_cache.h"
+#include "draw/draw_context.h"
 
 #include "st_context.h"
 #include "st_program.h"
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 7e347c4893..5b0eb6022b 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -44,8 +44,8 @@
 #include "st_draw.h"
 #include "st_cb_rasterpos.h"
 #include "st_draw.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
 #include "shader/prog_instruction.h"
 #include "vbo/vbo.h"
 
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 868c5f3c5f..c89c74229e 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -40,7 +40,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 #include "st_context.h"
 #include "st_cb_readpixels.h"
 #include "st_cb_fbo.h"
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 91a40288cc..03dbb30b0f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -47,7 +47,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/util/p_tile.h"
+#include "util/p_tile.h"
 
 
 #define DBG if (0) printf
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bf4618bed8..09e389f9dc 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -54,8 +54,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_inlines.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "draw/draw_context.h"
+#include "cso_cache/cso_cache.h"
 
 
 /**
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 57450e52bf..5888bcb98a 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -31,9 +31,9 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "tgsi/util/tgsi_dump.h"
 
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_debug.h"
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index ae9f5c8b11..1c0fa8c6aa 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -47,8 +47,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_inlines.h"
 
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
 
 
 static GLuint double_types[4] = {
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 459941cca8..6c09b86033 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -37,7 +37,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/cso_cache/cso_cache.h"
+#include "cso_cache/cso_cache.h"
 
 #include "st_context.h"
 #include "st_draw.h"
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 325aa20173..97206752af 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -32,9 +32,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/tgsi/util/tgsi_parse.h"
-#include "pipe/tgsi/util/tgsi_build.h"
-#include "pipe/tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_util.h"
 #include "st_mesa_to_tgsi.h"
 #include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index c8297baded..dc992ee9c2 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -38,8 +38,8 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/draw/draw_context.h"
-#include "pipe/tgsi/util/tgsi_dump.h"
+#include "draw/draw_context.h"
+#include "tgsi/util/tgsi_dump.h"
 
 #include "st_context.h"
 #include "st_cache.h"
-- 
cgit v1.2.3


From 7976a084e792daf0b23c688bfa8f577de141ecca Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 21 Feb 2008 11:01:35 -0800
Subject: Cell: Use multiple DMA tags for the dcache.

---
 src/gallium/drivers/cell/spu/spu_dcache.c | 2 +-
 src/gallium/drivers/cell/spu/spu_main.h   | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 698a5790bb..3baeaea998 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -33,7 +33,7 @@
 #define CACHE_NAME            data
 #define CACHED_TYPE           qword
 #define CACHE_TYPE            CACHE_TYPE_RO
-#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
 #include <cache-api.h>
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5c95d112ac..d14f1abbe7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -131,7 +131,10 @@ extern boolean Debug;
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
 #define TAG_TEXTURE_TILE      19
-#define TAG_INSTRUCTION_FETCH 20
+#define TAG_DCACHE0           20
+#define TAG_DCACHE1           21
+#define TAG_DCACHE2           22
+#define TAG_DCACHE3           23
 
 
-- 
cgit v1.2.3


From 1c50ea2cd9ab8752793c99b4a7a2a6656bdde1ac Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 27 Feb 2008 13:40:23 -0800
Subject: cell: Use unified data cache for textures too

---
 src/gallium/drivers/cell/spu/spu_main.c    |   2 +
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 184 +++++++++++------------------
 3 files changed, 72 insertions(+), 117 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index cc4bafdb3a..59300028d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -286,6 +286,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
       { spu.texture.width, spu.texture.height, 0.0, 0.0};
    spu.tex_size_mask = (vector unsigned int)
       { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
+   spu.tex_size_x_mask = spu_splats(spu.texture.width - 1);
+   spu.tex_size_y_mask = spu_splats(spu.texture.height - 1);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index d14f1abbe7..a13edd1702 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,6 +107,8 @@ struct spu_global
 
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
 
    vector float (*sample_texture)(vector float texcoord);
 
@@ -130,7 +132,6 @@ extern boolean Debug;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
-#define TAG_TEXTURE_TILE      19
 #define TAG_DCACHE0           20
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 3962aaa4a9..67eb08196a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -31,19 +31,7 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_colorpack.h"
-
-
-/**
- * Number of texture tiles to cache.
- * Note that this will probably be the largest consumer of SPU local store/
- * memory for this driver!
- */
-#define CACHE_SIZE 16
-
-static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
-
-static vector unsigned int tex_tile_xy[CACHE_SIZE];
-
+#include "spu_dcache.h"
 
 
 /**
@@ -52,78 +40,60 @@ static vector unsigned int tex_tile_xy[CACHE_SIZE];
 void
 invalidate_tex_cache(void)
 {
-   /* XXX memset? */
-   uint i;
-   for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
-   }
+   spu_dcache_mark_dirty((unsigned) spu.texture.start,
+                         4 * spu.texture.width * spu.texture.height);
 }
 
 
-/**
- * Return the cache pos/index which corresponds to tile (tx,ty)
- */
-static INLINE uint
-cache_pos(vector unsigned int txty)
+static uint
+get_texel(vec_uint4 coordinate)
 {
-   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
-   return pos;
+   vec_uint4 tmp;
+   unsigned x = spu_extract(coordinate, 0);
+   unsigned y = spu_extract(coordinate, 1);
+   const unsigned tiles_per_row = spu.texture.width / TILE_SIZE;
+   unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
+                                            + (x / TILE_SIZE));
+   unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
+                                + (x % TILE_SIZE));
+
+   spu_dcache_fetch_unaligned((qword *) & tmp,
+                              spu.texture.start + tile_offset + texel_offset,
+                              4);
+   return spu_extract(tmp, 0);
 }
 
 
-/**
- * Make sure the tile for texel (i,j) is present, return its position/index
- * in the cache.
- */
-static uint
-get_tex_tile(vector unsigned int ij)
+static void
+get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   /* tile address: tx,ty */
-   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
-   const uint pos = cache_pos(txty);
-
-   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
-       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
-
-      /* texture cache miss, fetch tile from main memory */
-      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
-      const uint bytes_per_tile = sizeof(tile_t);
-      const void *src = (const ubyte *) spu.texture.start
-         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
-
-      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id,
-             spu_extract(txty,0),
-             spu_extract(txty,1),
-             pos,
-             spu_extract(tex_tile_xy[pos],0),
-             spu_extract(tex_tile_xy[pos],1));
-
-      ASSERT_ALIGN16(tex_tiles[pos].ui);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(tex_tiles[pos].ui,  /* dest */
-              (unsigned int) src,
-              bytes_per_tile,      /* size */
-              TAG_TEXTURE_TILE,
-              0, /* tid */
-              0  /* rid */);
-
-      wait_on_mask(1 << TAG_TEXTURE_TILE);
-
-      tex_tile_xy[pos] = txty;
-   }
-   else {
-#if 0
-      printf("SPU %u: tex cache HIT at %d, %d\n",
-             spu.init.id, tx, ty);
-#endif
-   }
-
-   return pos;
+   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   vec_uint4 tile_x = spu_rlmask(x, -5);
+   vec_uint4 tile_y = spu_rlmask(y, -5);
+   const qword offset_x = si_andi((qword) x, 0x1f);
+   const qword offset_y = si_andi((qword) y, 0x1f);
+
+   const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE);
+   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+
+   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
+   tile_offset = si_mpy((qword) tile_offset, tile_size);
+
+   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
+   texel_offset = si_mpyui(texel_offset, 4);
+   
+   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+   
+   spu_dcache_fetch_unaligned((qword *) & texels[0],
+                              texture_ea + spu_extract(offset, 0), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[1],
+                              texture_ea + spu_extract(offset, 1), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[2],
+                              texture_ea + spu_extract(offset, 2), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[3],
+                              texture_ea + spu_extract(offset, 3), 4);
 }
 
-
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -134,9 +104,7 @@ sample_texture_nearest(vector float texcoord)
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
-   uint pos = get_tex_tile(itc);
-   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
+   uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
 
@@ -144,49 +112,33 @@ sample_texture_nearest(vector float texcoord)
 vector float
 sample_texture_bilinear(vector float texcoord)
 {
-   static const vector unsigned int offset10 = {1, 0, 0, 0};
-   static const vector unsigned int offset01 = {0, 1, 0, 0};
+   static const vec_uint4 offset_x = {0, 0, 1, 1};
+   static const vec_uint4 offset_y = {0, 1, 0, 1};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
-   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
-   vector unsigned int itc01 = spu_add(itc00, offset01);
-   vector unsigned int itc10 = spu_add(itc00, offset10);
-   vector unsigned int itc11 = spu_add(itc10, offset01);
-
-   /* mask (GL_REPEAT) */
-   itc00 = spu_and(itc00, spu.tex_size_mask);
-   itc01 = spu_and(itc01, spu.tex_size_mask);
-   itc10 = spu_and(itc10, spu.tex_size_mask);
-   itc11 = spu_and(itc11, spu.tex_size_mask);
-
-   /* intra tile addr */
-   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
-   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
-   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
-   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
-
-   /* get tile cache positions */
-   uint pos00 = get_tex_tile(itc00);
-   uint pos01, pos10, pos11;
-   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
-       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
-      /* all texels are in the same tile */
-      pos01 = pos10 = pos11 = pos00;
-   }
-   else {
-      pos01 = get_tex_tile(itc01);
-      pos10 = get_tex_tile(itc10);
-      pos11 = get_tex_tile(itc11);
-   }
-
-   /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+
+   vec_uint4 texels[4];
+   
+   vec_uint4 x = spu_splats(spu_extract(itc, 0));
+   vec_uint4 y = spu_splats(spu_extract(itc, 1));
+
+   x = spu_add(x, offset_x);
+   y = spu_add(y, offset_y);
+
+   x = spu_and(x, spu.tex_size_x_mask);
+   y = spu_and(y, spu.tex_size_y_mask);
+
+   get_four_texels(x, y, texels);
+
+   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
+   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
+   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
+   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
-- 
cgit v1.2.3


From 1936e4bdfd776f78f9fe44f77ce66066fd166360 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 17 Mar 2008 15:45:52 -0700
Subject: cell: Initial code-gen for alpha / stencil / depth testing

Alpha test is currently broken because all per-fragment testing occurs
before alpha is calculated.

Stencil test is currently broken because the Z-clear code asserts if
there is a stencil buffer.
---
 src/gallium/drivers/cell/common.h                  |   10 +
 src/gallium/drivers/cell/ppu/Makefile              |    1 +
 src/gallium/drivers/cell/ppu/cell_context.h        |   25 +-
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |   37 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |   24 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 1020 ++++++++++++++++++++
 .../drivers/cell/ppu/cell_state_per_fragment.h     |   35 +
 src/gallium/drivers/cell/spu/Makefile              |    1 +
 src/gallium/drivers/cell/spu/spu_main.c            |   25 +-
 src/gallium/drivers/cell/spu/spu_main.h            |   16 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  191 ++++
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   32 +
 src/gallium/drivers/cell/spu/spu_render.c          |    4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |   23 +-
 src/gallium/drivers/cell/spu/spu_ztest.h           |  135 ---
 15 files changed, 1409 insertions(+), 170 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
 create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_per_fragment_op.h
 delete mode 100644 src/gallium/drivers/cell/spu/spu_ztest.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9a4004535e..fe93fd8e1a 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,6 +104,16 @@
 
 
+/**
+ */
+struct cell_command_depth_stencil_alpha_test {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+   unsigned read_depth;         /**< Flag: should depth be read? */
+   unsigned read_stencil;       /**< Flag: should stencil be read? */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index d38fa6ce07..0389a9554c 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -27,6 +27,7 @@ SOURCES = \
 	cell_flush.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
+	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index b221424323..9e79db0ace 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -57,16 +57,37 @@ struct cell_fragment_shader_state
 };
 
 
+struct cell_blend_state {
+   struct pipe_blend_state base;
+
+   /**
+    * Generated code to perform alpha blending
+    */
+   struct spe_function code;
+};
+
+
+struct cell_depth_stencil_alpha_state {
+   struct pipe_depth_stencil_alpha_state   base;
+
+   /**
+    * Generated code to perform alpha, stencil, and depth testing on the SPE
+    */
+   struct spe_function code;
+
+};
+
+
 struct cell_context
 {
    struct pipe_context pipe;
 
    struct cell_winsys *winsys;
 
-   const struct pipe_blend_state *blend;
+   const struct cell_blend_state *blend;
    const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
    uint num_samplers;
-   const struct pipe_depth_stencil_alpha_state   *depth_stencil;
+   const struct cell_depth_stencil_alpha_state   *depth_stencil;
    const struct pipe_rasterizer_state *rasterizer;
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 025ed3bbbf..66ede99d13 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -36,6 +36,7 @@
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_texture.h"
+#include "cell_state_per_fragment.h"
 
 
@@ -43,7 +44,12 @@ static void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   return mem_dup(blend, sizeof(*blend));
+   struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
+
+   (void) memcpy(cb, blend, sizeof(*blend));
+   cb->code.store = NULL;
+
+   return cb;
 }
 
 
@@ -54,7 +60,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 
    draw_flush(cell->draw);
 
-   cell->blend = (const struct pipe_blend_state *)blend;
+   cell->blend = (const struct cell_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
 }
@@ -63,7 +69,10 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
-   FREE(blend);
+   struct cell_blend_state *cb = (struct cell_blend_state *) blend;
+   
+   spe_release_func(& cb->code);
+   FREE(cb);
 }
 
 
@@ -87,7 +96,13 @@ static void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   return mem_dup(depth_stencil, sizeof(*depth_stencil));
+   struct cell_depth_stencil_alpha_state *cdsa =
+       MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
+
+   (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
+   cdsa->code.store = NULL;
+
+   return cdsa;
 }
 
 
@@ -96,12 +111,16 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
                                     void *depth_stencil)
 {
    struct cell_context *cell = cell_context(pipe);
+   struct cell_depth_stencil_alpha_state *cdsa =
+       (struct cell_depth_stencil_alpha_state *) depth_stencil;
 
    draw_flush(cell->draw);
 
-   cell->depth_stencil
-      = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
+   if (cdsa->code.store == NULL) {
+      cell_generate_depth_stencil_test(cdsa);
+   }
 
+   cell->depth_stencil = cdsa;
    cell->dirty |= CELL_NEW_DEPTH_STENCIL;
 }
 
@@ -109,7 +128,11 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 static void
 cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
 {
-   FREE(depth);
+   struct cell_depth_stencil_alpha_state *cdsa =
+       (struct cell_depth_stencil_alpha_state *) depth;
+
+   spe_release_func(& cdsa->code);
+   FREE(cdsa);
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 670eb26bdd..c8d5fdf709 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -71,9 +71,27 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
-                     cell->depth_stencil,
-                     sizeof(struct pipe_depth_stencil_alpha_state));
+      struct cell_command_depth_stencil_alpha_test dsat;
+      
+
+      dsat.base = (intptr_t) cell->depth_stencil->code.store;
+      dsat.size = (char *) cell->depth_stencil->code.csr
+	  - (char *) cell->depth_stencil->code.store;
+      dsat.read_depth = TRUE;
+      dsat.read_stencil = FALSE;
+
+      {
+	 uint32_t *p = cell->depth_stencil->code.store;
+
+	 printf("\t.text\n");
+	 for (/* empty */; p < cell->depth_stencil->code.csr; p++) {
+	    printf("\t.long\t0x%04x\n", *p);
+	 }
+	 fflush(stdout);
+      }
+
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat,
+		     sizeof(dsat));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
new file mode 100644
index 0000000000..60b64438d8
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -0,0 +1,1020 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions perform both alpha, depth, and stencil
+ * testing as well as alpha blending.
+ *
+ * \note
+ * Occlusion query is not supported, but this is the right place to add that
+ * support.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "cell_context.h"
+
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Generate code to perform alpha testing.
+ *
+ * The code generated by this function uses the register specificed by
+ * \c mask as both an input and an output.
+ *
+ * \param dsa    Current alpha-test state
+ * \param f      Function to which code should be appended
+ * \param mask   Index of register containing active fragment mask
+ * \param alphas Index of register containing per-fragment alpha values
+ *
+ * \note Emits a maximum of 6 instructions.
+ */
+static void
+emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int alphas)
+{
+   /* If the alpha function is either NEVER or ALWAYS, there is no need to
+    * load the reference value into a register.  ALWAYS is a fairly common
+    * case, and this optimization saves 2 instructions.
+    */
+   if (dsa->alpha.enabled
+       && (dsa->alpha.func != PIPE_FUNC_NEVER)
+       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      int ref = spe_allocate_available_register(f);
+      int tmp_a = spe_allocate_available_register(f);
+      int tmp_b = spe_allocate_available_register(f);
+      union {
+	 float f;
+	 unsigned u;
+      } ref_val;
+      boolean complement = FALSE;
+
+      ref_val.f = dsa->alpha.ref;
+
+      spe_il(f, ref, ref_val.u & 0x0000ffff);
+      spe_ilh(f, ref, ref_val.u >> 16);
+
+      switch (dsa->alpha.func) {
+      case PIPE_FUNC_NOTEQUAL:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_EQUAL:
+	 spe_fceq(f, tmp_a, ref, alphas);
+	 break;
+
+      case PIPE_FUNC_LEQUAL:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_GREATER:
+	 spe_fcgt(f, tmp_a, ref, alphas);
+	 break;
+
+      case PIPE_FUNC_LESS:
+	 complement = TRUE;
+	 /* FALLTHROUGH */
+
+      case PIPE_FUNC_GEQUAL:
+	 spe_fcgt(f, tmp_a, ref, alphas);
+	 spe_fceq(f, tmp_b, ref, alphas);
+	 spe_or(f, tmp_a, tmp_b, tmp_a);
+	 break;
+
+      case PIPE_FUNC_ALWAYS:
+      case PIPE_FUNC_NEVER:
+      default:
+	 assert(0);
+	 break;
+      }
+
+      if (complement) {
+	 spe_andc(f, mask, mask, tmp_a);
+      } else {
+	 spe_and(f, mask, mask, tmp_a);
+      }
+
+      spe_release_register(f, ref);
+      spe_release_register(f, tmp_a);
+      spe_release_register(f, tmp_b);
+   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
+      spe_il(f, mask, 0);
+   }
+}
+
+
+/**
+ * \param dsa        Current depth-test state
+ * \param f          Function to which code should be appended
+ * \param m          Mask of allocated / free SPE registers
+ * \param mask       Index of register to contain depth-pass mask
+ * \param stored     Index of register containing values from depth buffer
+ * \param calculated Index of register containing per-fragment depth values
+ *
+ * \return
+ * If the calculated depth comparison mask is the actual mask, \c FALSE is
+ * returned.  If the calculated depth comparison mask is the compliment of
+ * the actual mask, \c TRUE is returned.
+ *
+ * \note Emits a maximum of 3 instructions.
+ */
+static boolean
+emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int stored, int calculated)
+{
+   unsigned func = (dsa->depth.enabled)
+       ? dsa->depth.func : PIPE_FUNC_ALWAYS;
+   int tmp = spe_allocate_available_register(f);
+   boolean compliment = FALSE;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask, 0);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      spe_ceq(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      spe_clgt(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_LESS:
+      compliment = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      spe_clgt(f, mask, calculated, stored);
+      spe_ceq(f, tmp, calculated, stored);
+      spe_or(f, mask, mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      spe_il(f, mask, ~0);
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, tmp);
+   return compliment;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions.
+ */
+static void
+emit_stencil_op(struct spe_function *f,
+                int out, int in, int mask, unsigned op, unsigned ref)
+{
+   const int clamp = spe_allocate_available_register(f);
+   const int tmp = spe_allocate_available_register(f);
+
+   switch(op) {
+   case PIPE_STENCIL_OP_KEEP:
+      assert(0);
+   case PIPE_STENCIL_OP_ZERO:
+      spe_il(f, out, 0);
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      spe_il(f, out, ref);
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      spe_il(f, clamp, 0x0ff);
+      spe_ai(f, out, in, 1);
+      spe_cgti(f, tmp, out, clamp);
+      spe_selb(f, out, out, clamp, tmp);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      spe_il(f, clamp, 0);
+      spe_ai(f, out, in, -1);
+      spe_cgti(f, tmp, out, clamp);
+      spe_selb(f, out, clamp, out, tmp);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      spe_ai(f, out, in, 1);
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      spe_ai(f, out, in, -1);
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      spe_nor(f, out, in, in);
+      break;
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+   spe_release_register(f, clamp);
+
+   spe_selb(f, out, in, out, mask);
+}
+
+
+/**
+ * \param dsa        Depth / stencil test state
+ * \param face       0 for front face, 1 for back face
+ * \param f          Function to append instructions to
+ * \param reg_mask   Mask of allocated registers
+ * \param mask       Register containing mask of fragments passing the
+ *                   alpha test
+ * \param depth_mask Register containing mask of fragments passing the
+ *                   depth test
+ * \param depth_compliment  Is \c depth_mask the compliment of the actual mask?
+ * \param stencil    Register containing values from stencil buffer
+ * \param depth_pass Register to store mask of fragments passing stencil test
+ *                   and depth test
+ * 
+ * \note
+ * Emits a maximum of 10 + (3 * 5) = 25 instructions.
+ */
+static int
+emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
+                  unsigned face, 
+                  struct spe_function *f,
+                  int mask,
+                  int depth_mask,
+                  boolean depth_complement,
+                  int stencil,
+                  int depth_pass)
+{
+   int stencil_fail = spe_allocate_available_register(f);
+   int depth_fail = spe_allocate_available_register(f);
+   int stencil_mask = spe_allocate_available_register(f);
+   int stencil_pass = spe_allocate_available_register(f);
+   int face_stencil = spe_allocate_available_register(f);
+   int stencil_src = stencil;
+   const unsigned ref = (dsa->stencil[face].ref_value
+                         & dsa->stencil[face].value_mask);
+   boolean complement = FALSE;
+   int stored = spe_allocate_available_register(f);
+   int tmp = spe_allocate_available_register(f);
+
+
+   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+       && (dsa->stencil[face].value_mask != 0x0ff)) {
+      spe_andi(f, stored, stencil, dsa->stencil[face].value_mask);
+   }
+
+
+   switch (dsa->stencil[face].func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, stencil_mask, 0);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      spe_ceqi(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GREATER:
+      spe_clgti(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_LESS:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_GEQUAL:
+      spe_clgti(f, stencil_mask, stored, ref);
+      spe_ceqi(f, tmp, stored, ref);
+      spe_or(f, stencil_mask, stencil_mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* See comment below. */
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, stored);
+   spe_release_register(f, tmp);
+
+
+   /* ALWAYS is a very common stencil-test, so some effort is applied to
+    * optimize that case.  The stencil-pass mask is the same as the input
+    * fragment mask.  This makes the stencil-test (above) a no-op, and the
+    * input fragment mask can be "renamed" the stencil-pass mask.
+    */
+   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
+      spe_release_register(f, stencil_pass);
+      stencil_pass = mask;
+   } else {
+      if (complement) {
+         spe_andc(f, stencil_pass, mask, stencil_mask);
+      } else {
+         spe_and(f, stencil_pass, mask, stencil_mask);
+      }
+   }
+
+   if (depth_complement) {
+      spe_andc(f, depth_pass, stencil_pass, depth_mask);
+   } else {
+      spe_and(f, depth_pass, stencil_pass, depth_mask);
+   }
+
+
+   /* Conditionally emit code to update the stencil value under various
+    * condititons.  Note that there is no need to generate code under the
+    * following circumstances:
+    * 
+    * - Stencil write mask is zero.
+    * - For stencil-fail if the stencil test is ALWAYS
+    * - For depth-fail if the stencil test is NEVER
+    * - For depth-pass if the stencil test is NEVER
+    * - Any of the 3 conditions if the operation is KEEP
+    */
+   if (dsa->stencil[face].write_mask != 0) {
+      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (complement) {
+            spe_and(f, stencil_fail, mask, stencil_mask);
+         } else {
+            spe_andc(f, stencil_fail, mask, stencil_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
+                         dsa->stencil[face].fail_op,
+                         dsa->stencil[face].ref_value);
+
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
+         if (depth_complement) {
+            spe_and(f, depth_fail, stencil_pass, depth_mask);
+         } else {
+            spe_andc(f, depth_fail, stencil_pass, depth_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
+                         dsa->stencil[face].zfail_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
+         emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
+                         dsa->stencil[face].zpass_op,
+                         dsa->stencil[face].ref_value);
+         stencil_src = face_stencil;
+      }
+   }
+
+   spe_release_register(f, stencil_fail);
+   spe_release_register(f, depth_fail);
+   spe_release_register(f, stencil_mask);
+   if (stencil_pass != mask) {
+      spe_release_register(f, stencil_pass);
+   }
+
+   /* If all of the stencil operations were KEEP or the stencil write mask was
+    * zero, "stencil_src" will still be set to "stencil".  In this case
+    * release the "face_stencil" register.  Otherwise apply the stencil write
+    * mask to select bits from the calculated stencil value and the previous
+    * stencil value.
+    */
+   if (stencil_src == stencil) {
+      spe_release_register(f, face_stencil);
+   } else if (dsa->stencil[face].write_mask != 0x0ff) {
+      int tmp = spe_allocate_available_register(f);
+      
+      spe_il(f, tmp, dsa->stencil[face].write_mask);
+      spe_selb(f, stencil_src, stencil, stencil_src, tmp);
+
+      spe_release_register(f, tmp);
+   }
+
+   return stencil_src;
+}
+
+
+void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
+{
+   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
+   struct spe_function *const f = &cdsa->code;
+
+   /* This code generates a maximum of 6 (alpha test) + 3 (depth test)
+    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
+    * up to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   /* Allocate registers for the function's input parameters.  Cleverly (and
+    * clever code is usually dangerous, but I couldn't resist) the generated
+    * function returns a structure.  Returned structures start with register
+    * 3, and the structure fields are ordered to match up exactly with the
+    * input parameters.
+    */
+   int mask = spe_allocate_register(f, 3);
+   int depth = spe_allocate_register(f, 4);
+   int stencil = spe_allocate_register(f, 5);
+   int zvals = spe_allocate_register(f, 6);
+   int frag_a = spe_allocate_register(f, 7);
+   int facing = spe_allocate_register(f, 8);
+
+   int depth_mask = spe_allocate_available_register(f);
+
+   boolean depth_complement;
+
+
+   emit_alpha_test(dsa, f, mask, frag_a);
+
+   depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals);
+
+   if (dsa->stencil[0].enabled) {
+      const int front_depth_pass = spe_allocate_available_register(f);
+      int front_stencil = emit_stencil_test(dsa, 0, f, mask,
+                                            depth_mask, depth_complement,
+                                            stencil, front_depth_pass);
+
+      if (dsa->stencil[1].enabled) {
+         const int back_depth_pass = spe_allocate_available_register(f);
+         int back_stencil = emit_stencil_test(dsa, 1, f, mask,
+                                              depth_mask,  depth_complement,
+                                              stencil, back_depth_pass);
+
+         /* If the front facing stencil value and the back facing stencil
+          * value are stored in the same register, there is no need to select
+          * a value based on the facing.  This can happen if the stencil value
+          * was not modified due to the write masks being zero, the stencil
+          * operations being KEEP, etc.
+          */
+         if (front_stencil != back_stencil) {
+            spe_selb(f, stencil, back_stencil, front_stencil, facing);
+         }
+         
+         if (back_stencil != stencil) { 
+            spe_release_register(f, back_stencil);
+         }
+
+         if (front_stencil != stencil) { 
+            spe_release_register(f, front_stencil);
+         }
+
+         spe_selb(f, mask, back_depth_pass, front_depth_pass, facing);
+
+         spe_release_register(f, back_depth_pass);
+      } else {
+         if (front_stencil != stencil) { 
+            spe_or(f, stencil, front_stencil, front_stencil);
+            spe_release_register(f, front_stencil);
+         }
+      }
+
+      spe_release_register(f, front_depth_pass);
+   } else if (dsa->depth.enabled) {
+      if (depth_complement) {
+         spe_andc(f, mask, mask, depth_mask);
+      } else {
+         spe_and(f, mask, mask, depth_mask);
+      }
+   }
+
+   if (dsa->depth.writemask) {
+         spe_selb(f, depth, depth, zvals, mask);
+   }
+
+   spe_bi(f, 0, 0, 0);
+}
+
+
+/**
+ * \note Emits a maximum of 3 instructions
+ */
+static int
+emit_alpha_factor_calculation(struct spe_function *f,
+                              unsigned factor, float const_alpha,
+                              int src_alpha, int dst_alpha)
+{
+   union {
+      float f;
+      unsigned u;
+   } alpha;
+   int factor_reg;
+   int tmp;
+
+
+   alpha.f = const_alpha;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_or(f, factor_reg, src_alpha, src_alpha);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor_reg = dst_alpha;
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      const_alpha = 1.0 - const_alpha;
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, factor_reg, alpha.u & 0x0ffff);
+      spe_ilh(f, factor_reg, alpha.u >> 16);
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      factor_reg = -1;
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, src_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor_reg = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, dst_alpha);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+      factor_reg = -1;
+      break;
+   }
+
+   return factor_reg;
+}
+
+
+/**
+ * \note Emits a maximum of 5 instructions
+ */
+static void
+emit_color_factor_calculation(struct spe_function *f,
+                              unsigned sF, unsigned mask,
+			      const struct pipe_blend_color *blend_color,
+                              const int *src,
+                              const int *dst,
+                              int *factor)
+{
+   union {
+      float f[4];
+      unsigned u[4];
+   } color;
+   int tmp;
+   unsigned i;
+
+
+   color.f[0] = blend_color->color[0];
+   color.f[1] = blend_color->color[1];
+   color.f[2] = blend_color->color[2];
+   color.f[3] = blend_color->color[3];
+
+   factor[0] = -1;
+   factor[1] = -1;
+   factor[2] = -1;
+   factor[3] = -1;
+
+   switch (sF) {
+   case PIPE_BLENDFACTOR_ONE:
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_or(f, factor[i], src[i], src[i]);
+         }
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_or(f, factor[0], src[3], src[3]);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor[0] = dst[3];
+      factor[1] = dst[3];
+      factor[2] = dst[3];
+      break;
+
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      factor[0] = dst[0];
+      factor[1] = dst[1];
+      factor[2] = dst[2];
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* Alpha saturate means min(As, 1-Ad).
+       */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, tmp, tmp, dst[3]);
+      spe_fcgt(f, factor[0], tmp, src[3]);
+      spe_selb(f, factor[0], src[3], tmp, factor[0]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      color.f[0] = 1.0 - color.f[0];
+      color.f[1] = 1.0 - color.f[1];
+      color.f[2] = 1.0 - color.f[2];
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+	 factor[i] = spe_allocate_available_register(f);
+
+	 spe_il(f, factor[i], color.u[i] & 0x0ffff);
+	 spe_ilh(f, factor[i], color.u[i] >> 16);
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      color.f[3] = 1.0 - color.f[3];
+      /* FALLTHROUGH */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, factor[0], color.u[3] & 0x0ffff);
+      spe_ilh(f, factor[0], color.u[3] >> 16);
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, src[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, src[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, dst[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, dst[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      assert(0);
+   }
+}
+
+
+static void
+emit_blend_calculation(struct spe_function *f,
+                       unsigned func, unsigned sF, unsigned dF,
+                       int src, int src_factor, int dst, int dst_factor)
+{
+   int tmp = spe_allocate_available_register(f);
+
+   switch (func) {
+   case PIPE_BLEND_ADD:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fa(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         }
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fma(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            /* Do nothing. */
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, src, dst);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, dst);
+         }
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fms(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      if (sF == PIPE_BLENDFACTOR_ONE) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, src);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_fs(f, src, dst, src);
+         }
+      } else if (sF == PIPE_BLENDFACTOR_ZERO) {
+         if (dF == PIPE_BLENDFACTOR_ZERO) {
+            spe_il(f, src, 0);
+         } else if (dF == PIPE_BLENDFACTOR_ONE) {
+            spe_or(f, src, dst, dst);
+         }
+      } else {
+         spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, src, dst_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_MIN:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);
+      break;
+
+   case PIPE_BLEND_MAX:
+      spe_cgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb,
+			  const struct pipe_blend_color *blend_color)
+{
+   struct pipe_blend_state *const b = &cb->base;
+   struct spe_function *const f = &cb->code;
+
+   /* This code generates a maximum of 3 (source alpha factor)
+    * + 3 (destination alpha factor) + (3 * 5) (source color factor)
+    * + (3 * 5) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 49 instlructions.  Round up to 64 to
+    * make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   const int frag[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+   const int pixel[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+   const int mask = spe_allocate_register(f, 11);
+   unsigned func[4];
+   unsigned sF[4];
+   unsigned dF[4];
+   unsigned i;
+   int src_factor[4];
+   int dst_factor[4];
+
+
+   /* Does the selected blend mode make use of the source / destination
+    * color (RGB) blend factors?
+    */
+   boolean need_color_factor = b->blend_enable
+       && (b->rgb_func != PIPE_BLEND_MIN)
+       && (b->rgb_func != PIPE_BLEND_MAX);
+
+   /* Does the selected blend mode make use of the source / destination
+    * alpha blend factors?
+    */
+   boolean need_alpha_factor = b->blend_enable
+       && (b->alpha_func != PIPE_BLEND_MIN)
+       && (b->alpha_func != PIPE_BLEND_MAX);
+
+
+   sF[0] = b->rgb_src_factor;
+   sF[1] = sF[0];
+   sF[2] = sF[0];
+   sF[3] = (b->alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+       ? PIPE_BLENDFACTOR_ONE : b->alpha_src_factor;
+
+   dF[0] = b->rgb_dst_factor;
+   dF[1] = dF[0];
+   dF[2] = dF[0];
+   dF[3] = b->rgb_dst_factor;
+
+
+   /* If alpha writing is enabled and the alpha blend mode requires use of
+    * the alpha factor, calculate the alpha factor.
+    */
+   if (((b->colormask & 8) != 0) && need_alpha_factor) {
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3],
+						    blend_color->color[3],
+                                                    frag[3], pixel[3]);
+
+      /* If the alpha destination blend factor is the same as the alpha source
+       * blend factor, re-use the previously calculated value.
+       */
+      dst_factor[3] = (dF[3] == sF[3])
+          ? src_factor[3]
+          : emit_alpha_factor_calculation(f, dF[3],
+					  blend_color->color[3],
+                                          frag[3], pixel[3]);
+   }
+   
+
+   if (sF[0] == sF[3]) {
+      src_factor[0] = src_factor[3];
+      src_factor[1] = src_factor[3];
+      src_factor[2] = src_factor[3];
+   } else if (sF[0] == dF[3]) {
+      src_factor[0] = dst_factor[3];
+      src_factor[1] = dst_factor[3];
+      src_factor[2] = dst_factor[3];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_src_factor,
+                                    b->colormask,
+                                    blend_color,
+                                    frag, pixel, src_factor);
+   }
+
+
+   if (dF[0] == sF[3]) {
+      dst_factor[0] = src_factor[3];
+      dst_factor[1] = src_factor[3];
+      dst_factor[2] = src_factor[3];
+   } else if (dF[0] == dF[3]) {
+      dst_factor[0] = dst_factor[3];
+      dst_factor[1] = dst_factor[3];
+      dst_factor[2] = dst_factor[3];
+   } else if (dF[0] == sF[0]) {
+      dst_factor[0] = src_factor[0];
+      dst_factor[1] = src_factor[1];
+      dst_factor[2] = src_factor[2];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rgb_dst_factor,
+                                    b->colormask,
+                                    blend_color,
+                                    frag, pixel, dst_factor);
+   }
+
+   
+
+   func[0] = b->rgb_func;
+   func[1] = func[0];
+   func[2] = func[0];
+   func[3] = b->alpha_func;
+
+   for (i = 0; i < 4; ++i) {
+      if ((b->colormask & (1U << i)) != 0) {
+         emit_blend_calculation(f,
+                                func[i], sF[i], dF[i],
+                                frag[i], src_factor[i],
+                                pixel[i], dst_factor[i]);
+         spe_selb(f, frag[i], pixel[i], frag[i], mask);
+      } else {
+         spe_or(f, frag[i], pixel[i], pixel[i]);
+      }
+   }
+
+   spe_bi(f, 0, 0, 0);
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
new file mode 100644
index 0000000000..541c3b3be0
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -0,0 +1,35 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CELL_STATE_PER_FRAGMENT_H
+#define CELL_STATE_PER_FRAGMENT_H
+
+extern void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
+
+extern void
+cell_generate_alpha_blend(struct cell_blend_state *cb,
+			  const struct pipe_blend_color *blend_color);
+
+#endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index c071de1900..115ca8cd90 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -19,6 +19,7 @@ SOURCES = \
 	spu_main.c \
 	spu_blend.c \
 	spu_dcache.c \
+	spu_per_fragment_op.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 59300028d4..8e46f6e471 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,6 +58,9 @@ struct spu_vs_context draw;
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
     ALIGN16_ATTRIB;
 
+static unsigned char depth_stencil_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -248,14 +251,26 @@ cmd_state_blend(const struct pipe_blend_state *state)
 
 
 static void
-cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
+cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
 {
    if (Debug)
       printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
              spu.init.id,
-             state->depth.enabled);
+             state->read_depth);
+
+   ASSERT_ALIGN16(state->base);
+
+   mfc_get(depth_stencil_code_buffer,
+	   (unsigned int) state->base,  /* src */
+	   ROUNDUP16(state->size),
+	   TAG_BATCH_BUFFER,
+	   0, /* tid */
+	   0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
 
-   memcpy(&spu.depth_stencil, state, sizeof(*state));
+   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
+   spu.read_depth = state->read_depth;
+   spu.read_stencil = state->read_stencil;
 }
 
 
@@ -415,9 +430,9 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
+         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
                                  &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index a13edd1702..444e218645 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -56,6 +56,17 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
+struct spu_frag_test_results {
+   qword mask;
+   qword depth;
+   qword stencil;
+};
+
+typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
+    qword pixel_depth, qword pixel_stencil, qword frag_depth,
+    qword frag_alpha, qword facing);
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -79,8 +90,9 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
-   struct pipe_blend_state blend_stencil;
-   struct pipe_depth_stencil_alpha_state depth_stencil;
+   boolean read_depth;
+   boolean read_stencil;
+   frag_test_func frag_test;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
new file mode 100644
index 0000000000..d42b522b41
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -0,0 +1,191 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file spu_per_fragment_op.c
+ * SPU implementation various per-fragment operations.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_format.h"
+#include "spu_main.h"
+#include "spu_per_fragment_op.h"
+
+#define ZERO 0x80
+
+static void
+read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+             enum pipe_format depth_format, qword *depth,
+             qword *stencil)
+{
+   const int ix = x / 2;
+   const int iy = y / 2;
+
+   switch (depth_format) {
+   case PIPE_FORMAT_Z16_UNORM: {
+      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+      const qword shuf_vec = (qword) {
+         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
+      };
+
+
+      /* At even X values we want the first 4 shorts, and at odd X values we
+       * want the second 4 shorts.
+       */
+      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
+      qword bias_mask = si_fsmbi(0x3333);
+      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
+
+      *depth = si_shufb(*ptr, *ptr, sv);
+      *stencil = si_il(0);
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z32_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+
+      *depth = *ptr;
+      *stencil = si_il(0);
+      break;
+   }
+      
+
+   case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0x7777);
+
+      *depth = si_and(*ptr, mask);
+      *stencil = si_rotmai(si_andc(*ptr, mask), -24);
+      break;
+   }
+
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+static void
+write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
+              enum pipe_format depth_format,
+              qword depth, qword stencil)
+{
+   const int ix = x / 2;
+   const int iy = y / 2;
+
+   (void) stencil;
+
+   switch (depth_format) {
+   case PIPE_FORMAT_Z16_UNORM: {
+      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
+
+      qword sv = ((ix & 0x01) == 0) 
+          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
+                      24, 25, 26, 27, 28, 29, 30, 31 }
+          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
+                      2, 3, 6, 7, 10, 11, 14, 15 };
+      *ptr = si_shufb(depth, *ptr, sv);
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z32_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      *ptr = depth;
+      break;
+   }
+
+
+   case PIPE_FORMAT_Z24S8_UNORM: {
+      qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      qword mask = si_fsmbi(0x7777);
+
+      stencil = si_rotmai(stencil, 24);
+      *ptr = si_selb(stencil, depth, mask);
+      break;
+   }
+
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+qword
+spu_do_depth_stencil(int x, int y,
+                     qword frag_mask, qword frag_depth, qword frag_alpha,
+                     qword facing)
+{
+   struct spu_frag_test_results  result;
+   qword pixel_depth;
+   qword pixel_stencil;
+
+   /* All of this preable code (everthing before the call to frag_test) should
+    * be generated on the PPU and upload to the SPU.
+    */
+   if (spu.read_depth || spu.read_stencil) {
+      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+                   &pixel_depth, &pixel_stencil);
+   }
+   
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
+      frag_depth = si_cfltu(frag_depth, 0);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
+                             frag_depth, frag_alpha, facing);
+
+
+   /* This code (everthing after the call to frag_test) should
+    * be generated on the PPU and upload to the SPU.
+    */
+   if (spu.read_depth || spu.read_stencil) {
+      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
+                    result.depth, result.stencil);
+   }
+
+   return result.mask;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
new file mode 100644
index 0000000000..6571258699
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -0,0 +1,32 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_PER_FRAGMENT_OP
+#define SPU_PER_FRAGMENT_OP
+
+extern qword
+spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
+		     qword frag_alpha, qword facing);
+
+#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 20e77aa2e6..6df59abd36 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index be9624cf7d..81823f2463 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,8 +38,7 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-
-#include "spu_ztest.h"
+#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -264,16 +263,12 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      int ix = (x - setup.cliprect_minx) / 4;
-      int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
-   }
-   else {
-      int ix = (x - setup.cliprect_minx) / 2;
-      int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
-   }
+   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
+					y - setup.cliprect_miny,
+					(qword) quadmask, 
+					(qword) zvals.v,
+					(qword) spu_splats((unsigned char) 0x0ffu),
+					(qword) spu_splats((unsigned int) 0x01u));
 
    if (spu_extract(spu_orx(mask), 0))
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
@@ -299,7 +294,7 @@ emit_quad( int x, int y, mask_t mask )
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
 
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       mask = do_depth_test(x, y, mask);
    }
 
@@ -434,7 +429,7 @@ static void flush_spans( void )
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.depth_stencil.depth.enabled) {
+   if (spu.read_depth) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
diff --git a/src/gallium/drivers/cell/spu/spu_ztest.h b/src/gallium/drivers/cell/spu/spu_ztest.h
deleted file mode 100644
index ce8ad00339..0000000000
--- a/src/gallium/drivers/cell/spu/spu_ztest.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * Zbuffer/depth test code.
- */
-
-
-#ifndef SPU_ZTEST_H
-#define SPU_ZTEST_H
-
-
-#ifdef __SPU__
-#include <spu_intrinsics.h>
-#endif
-
-
-
-/**
- * Perform Z testing for a 16-bit/value Z buffer.
- *
- * \param zvals  vector of four fragment zvalues as floats
- * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
- *               contains the Z values for 2 quads, 8 pixels.
- * \param x      x coordinate of quad (only lsbit is significant)
- * \param inMask indicates which fragments in the quad are alive
- * \return new mask indicating which fragments are alive after ztest
- */
-static INLINE vector unsigned int
-spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
-                  uint x, vector unsigned int inMask)
-{
-#define ZERO 0x80
-   vector unsigned int zvals_ui4, zbuf_ui4, mask;
-
-   /* convert floats to uints in [0, 65535] */
-   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
-   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
-
-   /* XXX this conditional could be removed with a bit of work */
-   if (x & 1) {
-      /* convert zbuffer values from ushorts to uints */
-      /* gather lower four ushorts */
-      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
-                             (vector unsigned int) *zbuf,
-                             ((vector unsigned char) {
-                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
-                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
-      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
-      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
-      /* mask &= inMask */
-      mask = spu_and(mask, inMask);
-      /* zbuf = mask ? zval : zbuf */
-      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
-      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
-      *zbuf = (vector unsigned short)
-         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     ((vector unsigned char) {
-                        16, 17, 18, 19, 20, 21, 22, 23,
-                        2, 3, 6, 7, 10, 11, 14, 15}));
-   }
-   else {
-      /* convert zbuffer values from ushorts to uints */
-      /* gather upper four ushorts */
-      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
-                             (vector unsigned int) *zbuf,
-                             ((vector unsigned char) {
-                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
-      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
-      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
-      /* mask &= inMask */
-      mask = spu_and(mask, inMask);
-      /* zbuf = mask ? zval : zbuf */
-      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
-      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
-      *zbuf = (vector unsigned short)
-         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     ((vector unsigned char) {
-                        2, 3, 6, 7, 10, 11, 14, 15,
-                        24, 25, 26, 27, 28, 29, 30, 31}));
-   }
-   return mask;
-#undef ZERO
-}
-
-
-/**
- * As above, but Zbuffer values as 32-bit uints
- */
-static INLINE vector unsigned int
-spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
-                  vector unsigned int inMask)
-{
-   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
-
-   /* convert floats to uints in [0, 0xffffffff] */
-   zvals_ui4 = spu_convtu(zvals, 32);
-   /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
-   mask = spu_cmpgt(zbuf, zvals_ui4);
-   /* mask &= inMask */
-   mask = spu_and(mask, inMask);
-   /* zbuf = mask ? zval : zbuf */
-   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
-
-   return mask;
-}
-
-
-#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From 2b21bde3b1fa6fe357a3a5adc6249e89d6915524 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 19 Mar 2008 17:29:39 -0700
Subject: cell: Use code-gen for alpha blend

So far this is only tested when GL_BLEND is disabled.
---
 src/gallium/drivers/cell/common.h              | 10 +++++
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 11 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 17 +++++--
 src/gallium/drivers/cell/spu/Makefile          |  1 -
 src/gallium/drivers/cell/spu/spu_blend.c       | 62 --------------------------
 src/gallium/drivers/cell/spu/spu_blend.h       | 37 ---------------
 src/gallium/drivers/cell/spu/spu_main.c        | 50 ++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h        | 17 ++++++-
 src/gallium/drivers/cell/spu/spu_tri.c         | 56 +++++++++++++++--------
 9 files changed, 129 insertions(+), 132 deletions(-)
 delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.c
 delete mode 100644 src/gallium/drivers/cell/spu/spu_blend.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index fe93fd8e1a..d59e4f7036 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -114,6 +114,16 @@ struct cell_command_depth_stencil_alpha_test {
 };
 
 
+/**
+ * Upload code to perform framebuffer blend operation
+ */
+struct cell_command_blend {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+   unsigned read_fb;            /**< Flag: should framebuffer be read? */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index c880760e4b..4ca8c153b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -54,14 +54,19 @@ cell_create_blend_state(struct pipe_context *pipe,
 
 
 static void
-cell_bind_blend_state(struct pipe_context *pipe, void *blend)
+cell_bind_blend_state(struct pipe_context *pipe, void *state)
 {
    struct cell_context *cell = cell_context(pipe);
+   struct cell_blend_state *blend = (struct cell_blend_state *) state;
+
 
    draw_flush(cell->draw);
 
-   cell->blend = (const struct cell_blend_state *)blend;
+   if ((blend != NULL) && (blend->code.store == NULL)) {
+      cell_generate_depth_stencil_test(blend);
+   }
 
+   cell->blend = blend;
    cell->dirty |= CELL_NEW_BLEND;
 }
 
@@ -70,7 +75,7 @@ static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-   
+
    spe_release_func(& cb->code);
    FREE(cb);
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 31cc938f07..5709b48f12 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -65,9 +65,20 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_BLEND) {
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
-                     cell->blend,
-                     sizeof(struct pipe_blend_state));
+      struct cell_command_blend blend;
+
+      if (cell->blend != NULL) {
+         blend.base = (intptr_t) cell->blend->code.store;
+         blend.size = (char *) cell->blend->code.csr
+             - (char *) cell->blend->code.store;
+         blend.read_fb = TRUE;
+      } else {
+         blend.base = 0;
+         blend.size = 0;
+         blend.read_fb = FALSE;
+      }
+
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 115ca8cd90..8e83610790 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -17,7 +17,6 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
-	spu_blend.c \
 	spu_dcache.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
diff --git a/src/gallium/drivers/cell/spu/spu_blend.c b/src/gallium/drivers/cell/spu/spu_blend.c
deleted file mode 100644
index 23ec0eeb45..0000000000
--- a/src/gallium/drivers/cell/spu/spu_blend.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "spu_main.h"
-#include "spu_blend.h"
-#include "spu_colorpack.h"
-
-
-void
-blend_quad(uint itx, uint ity, vector float colors[4])
-{
-   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
-   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
-   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
-   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
-   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
-
-   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
-   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
-   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
-   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
-
-   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
-   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
-   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
-   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
-
-   colors[0] = spu_add(spu_mul(colors[0], alpha00),
-                       spu_mul(fbc00, one_minus_alpha00));
-   colors[1] = spu_add(spu_mul(colors[1], alpha01),
-                       spu_mul(fbc01, one_minus_alpha01));
-   colors[2] = spu_add(spu_mul(colors[2], alpha10),
-                       spu_mul(fbc10, one_minus_alpha10));
-   colors[3] = spu_add(spu_mul(colors[3], alpha11),
-                       spu_mul(fbc11, one_minus_alpha11));
-}
-
diff --git a/src/gallium/drivers/cell/spu/spu_blend.h b/src/gallium/drivers/cell/spu/spu_blend.h
deleted file mode 100644
index 2b594b578b..0000000000
--- a/src/gallium/drivers/cell/spu/spu_blend.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SPU_BLEND_H
-#define SPU_BLEND_H
-
-
-extern void
-blend_quad(uint itx, uint ity, vector float colors[4]);
-
-
-#endif /* SPU_BLEND_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 937962285d..41bebf5362 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -61,6 +61,25 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
 static unsigned char depth_stencil_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char fb_blend_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
+static struct spu_blend_results
+default_blend(qword frag_r, qword frag_g, qword frag_b, qword frag_a,
+              qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+              qword frag_mask)
+{
+   struct spu_blend_results result;
+
+   result.r = si_selb(pixel_r, frag_r, frag_mask);
+   result.g = si_selb(pixel_g, frag_g, frag_mask);
+   result.b = si_selb(pixel_b, frag_b, frag_mask);
+   result.a = si_selb(pixel_a, frag_a, frag_mask);
+
+   return result;
+}
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -246,14 +265,31 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
 
 static void
-cmd_state_blend(const struct pipe_blend_state *state)
+cmd_state_blend(const struct cell_command_blend *state)
 {
    if (Debug)
       printf("SPU %u: BLEND: enabled %d\n",
              spu.init.id,
-             state->blend_enable);
+             (state->size != 0));
+
+   ASSERT_ALIGN16(state->base);
 
-   memcpy(&spu.blend, state, sizeof(*state));
+   if (state->size != 0) {
+      mfc_get(fb_blend_code_buffer,
+              (unsigned int) state->base,  /* src */
+              ROUNDUP16(state->size),
+              TAG_BATCH_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask(1 << TAG_BATCH_BUFFER);
+      spu.blend = (blend_func) fb_blend_code_buffer;
+      spu.read_fb = state->read_fb;
+   } else {
+      /* If there is no code, use the default;
+       */
+      spu.blend = default_blend;
+      spu.read_fb = FALSE;
+   }
 }
 
 
@@ -441,9 +477,8 @@ cmd_batch(uint opcode)
          pos += 1;
          break;
       case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct pipe_blend_state *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
+         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
@@ -587,6 +622,9 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   spu.blend = default_blend;
+   spu.read_fb = FALSE;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 444e218645..56d0968676 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,18 @@ typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
     qword frag_alpha, qword facing);
 
 
+struct spu_blend_results {
+   qword r;
+   qword g;
+   qword b;
+   qword a;
+};
+
+typedef struct spu_blend_results (*blend_func)(
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_mask);
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -93,7 +105,10 @@ struct spu_global
    boolean read_depth;
    boolean read_stencil;
    frag_test_func frag_test;
-   struct pipe_blend_state blend;
+   
+   boolean read_fb;
+   blend_func blend;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 81823f2463..c9f8cadcda 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -29,10 +29,10 @@
  * Triangle rendering within a tile.
  */
 
+#include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
-#include "spu_blend.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -326,27 +326,45 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(1, (float) x, (float) y, colors);
       }
 
-#if 1
-      if (spu.blend.blend_enable)
-         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
-#endif
 
-      if (spu_extract(mask, 0))
-         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
-      if (spu_extract(mask, 1))
-         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
-      if (spu_extract(mask, 2))
-         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
-      if (spu_extract(mask, 3))
-         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
+      /* Read the current framebuffer values.
+       *
+       * Ignore read_fb for now.  In the future we can use this to avoid
+       * reading the framebuffer if read_fb is false and the fragment mask is
+       * all 0xffffffff.  This is the common case, so it is probably worth
+       * the effort.  We'll have to profile to determine whether or not the
+       * extra conditional branches hurt overall performance.
+       */
+      vec_float4 aos_pix[4] = {
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      };
 
-#if 0
-      /* SIMD_Z with swizzled color buffer (someday) */
-      vector unsigned int uicolors = *((vector unsigned int *) &colors);
-      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
-#endif
-   }
+      qword soa_pix[4];
+      qword soa_frag[4];
 
+      /* Convert pixel and fragment data from AoS to SoA format.
+       */
+      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
+      const struct spu_blend_results result =
+          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
+                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
+                       (qword) mask);
+
+
+      /* Convert final pixel data from SoA to AoS format.
+       */
+      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
+      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
+      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
+      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+   }
 #endif
 }
 
-- 
cgit v1.2.3


From 600499cf888fee9a91ff3106beca939ea0c7b2bd Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 21 Mar 2008 11:15:49 -0700
Subject: cell: Change code-gen for CONST_COLOR blend factor

Previously the constant color blend factor was compiled into the
generated code.  This meant that the code had to be regenerated each
time the constant color was changed.  This doesn't fit with the model
used in Gallium.

As-is, the code could be better.  The constant color is loaded for
every quad processed, even if it is not used.  Also, if a lot of (1-x)
blend factors are used, 1.0 will be loaded and reloaded into registers
many times.
---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |  2 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 93 +++++++++++-----------
 .../drivers/cell/ppu/cell_state_per_fragment.h     |  3 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  2 +
 src/gallium/drivers/cell/spu/spu_tri.c             |  2 +
 5 files changed, 53 insertions(+), 49 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 86fcdcff1f..d956d6fad4 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -63,7 +63,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
    draw_flush(cell->draw);
 
    if ((blend != NULL) && (blend->code.store == NULL)) {
-      cell_generate_alpha_blend(blend, &cell->blend_color);
+      cell_generate_alpha_blend(blend);
    }
 
    cell->blend = blend;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index f353aeab0a..c750b1d89d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -588,19 +588,13 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
  */
 static int
 emit_alpha_factor_calculation(struct spe_function *f,
-                              unsigned factor, float const_alpha,
-                              int src_alpha, int dst_alpha)
+                              unsigned factor,
+                              int src_alpha, int dst_alpha, int const_alpha)
 {
-   union {
-      float f;
-      unsigned u;
-   } alpha;
    int factor_reg;
    int tmp;
 
 
-   alpha.f = const_alpha;
-
    switch (factor) {
    case PIPE_BLENDFACTOR_ONE:
       factor_reg = -1;
@@ -621,13 +615,17 @@ emit_alpha_factor_calculation(struct spe_function *f,
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      const_alpha = 1.0 - const_alpha;
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
       factor_reg = spe_allocate_available_register(f);
 
-      spe_il(f, factor_reg, alpha.u & 0x0ffff);
-      spe_ilh(f, factor_reg, alpha.u >> 16);
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor_reg, tmp, const_alpha);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor_reg = const_alpha;
       break;
 
    case PIPE_BLENDFACTOR_ZERO:
@@ -674,24 +672,15 @@ emit_alpha_factor_calculation(struct spe_function *f,
 static void
 emit_color_factor_calculation(struct spe_function *f,
                               unsigned sF, unsigned mask,
-                              const struct pipe_blend_color *blend_color,
                               const int *src,
                               const int *dst,
+                              const int *const_color,
                               int *factor)
 {
-   union {
-      float f[4];
-      unsigned u[4];
-   } color;
    int tmp;
    unsigned i;
 
 
-   color.f[0] = blend_color->color[0];
-   color.f[1] = blend_color->color[1];
-   color.f[2] = blend_color->color[2];
-   color.f[3] = blend_color->color[3];
-
    factor[0] = -1;
    factor[1] = -1;
    factor[2] = -1;
@@ -748,29 +737,40 @@ emit_color_factor_calculation(struct spe_function *f,
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      color.f[0] = 1.0 - color.f[0];
-      color.f[1] = 1.0 - color.f[1];
-      color.f[2] = 1.0 - color.f[2];
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_COLOR:
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
       for (i = 0; i < 3; i++) {
          factor[i] = spe_allocate_available_register(f);
 
-         spe_il(f, factor[i], color.u[i] & 0x0ffff);
-         spe_ilh(f, factor[i], color.u[i] >> 16);
+         spe_fs(f, factor[i], tmp, const_color[i]);
+      }
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+         factor[i] = const_color[i];
       }
       break;
 
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      color.f[3] = 1.0 - color.f[3];
-      /* FALLTHROUGH */
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
       factor[0] = spe_allocate_available_register(f);
       factor[1] = factor[0];
       factor[2] = factor[0];
 
-      spe_il(f, factor[0], color.u[3] & 0x0ffff);
-      spe_ilh(f, factor[0], color.u[3] >> 16);
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, const_color[3]);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = const_color[3];
+      factor[1] = factor[0];
+      factor[2] = factor[0];
       break;
 
    case PIPE_BLENDFACTOR_ZERO:
@@ -945,8 +945,7 @@ emit_blend_calculation(struct spe_function *f,
  * Generate code to perform alpha blending on the SPE
  */
 void
-cell_generate_alpha_blend(struct cell_blend_state *cb,
-                          const struct pipe_blend_color *blend_color)
+cell_generate_alpha_blend(struct cell_blend_state *cb)
 {
    struct pipe_blend_state *const b = &cb->base;
    struct spe_function *const f = &cb->code;
@@ -972,7 +971,13 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       spe_allocate_register(f, 9),
       spe_allocate_register(f, 10),
    };
-   const int mask = spe_allocate_register(f, 11);
+   const int const_color[4] = {
+      spe_allocate_register(f, 11),
+      spe_allocate_register(f, 12),
+      spe_allocate_register(f, 13),
+      spe_allocate_register(f, 14),
+   };
+   const int mask = spe_allocate_register(f, 15);
    unsigned func[4];
    unsigned sF[4];
    unsigned dF[4];
@@ -1053,8 +1058,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
     * the alpha factor, calculate the alpha factor.
     */
    if (((b->colormask & 8) != 0) && need_alpha_factor) {
-      src_factor[3] = emit_alpha_factor_calculation(f, sF[3],
-                                                    blend_color->color[3],
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3],
                                                     frag[3], pixel[3]);
 
       /* If the alpha destination blend factor is the same as the alpha source
@@ -1062,8 +1066,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
        */
       dst_factor[3] = (dF[3] == sF[3])
           ? src_factor[3]
-          : emit_alpha_factor_calculation(f, dF[3],
-                                          blend_color->color[3],
+          : emit_alpha_factor_calculation(f, dF[3], const_color[3],
                                           frag[3], pixel[3]);
    }
 
@@ -1080,8 +1083,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       emit_color_factor_calculation(f,
                                     b->rgb_src_factor,
                                     b->colormask,
-                                    blend_color,
-                                    frag, pixel, src_factor);
+                                    frag, pixel, const_color, src_factor);
    }
 
 
@@ -1101,8 +1103,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb,
       emit_color_factor_calculation(f,
                                     b->rgb_dst_factor,
                                     b->colormask,
-                                    blend_color,
-                                    frag, pixel, dst_factor);
+                                    frag, pixel, const_color, dst_factor);
    }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
index 541c3b3be0..f699247f9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -29,7 +29,6 @@ extern void
 cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
 
 extern void
-cell_generate_alpha_blend(struct cell_blend_state *cb,
-			  const struct pipe_blend_color *blend_color);
+cell_generate_alpha_blend(struct cell_blend_state *cb);
 
 #endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56d0968676..49f5d99674 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,6 +77,7 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword const_r, qword const_g, qword const_b, qword const_a,
     qword frag_mask);
 
 struct spu_framebuffer {
@@ -108,6 +109,7 @@ struct spu_global
    
    boolean read_fb;
    blend_func blend;
+   qword const_blend_color[4] ALIGN16_ATTRIB;
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index c4272d6e93..e6a1ce01df 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -356,6 +356,8 @@ emit_quad( int x, int y, mask_t mask )
       const struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
+                       spu.const_blend_color[0], spu.const_blend_color[1],
+                       spu.const_blend_color[2], spu.const_blend_color[3],
                        (qword) mask);
 
 
-- 
cgit v1.2.3


From 92126cea846959bb2152905a7712753d1114bd6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 26 Mar 2008 10:45:32 -0700
Subject: cell: Implement code-gen for logic op

This also implements code-gen for the float-to-packed color
conversion.  It's currently hardcoded for A8R8G8B8, but that can
easily be fixed as soon as other color depths are supported by the
Cell driver.
---
 src/gallium/drivers/cell/common.h                  |  11 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |   2 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  17 ++
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 261 ++++++++++++++++++++-
 .../drivers/cell/ppu/cell_state_per_fragment.h     |   4 +
 src/gallium/drivers/cell/spu/spu_main.c            |  19 ++
 src/gallium/drivers/cell/spu/spu_main.h            |   9 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  59 +++--
 8 files changed, 349 insertions(+), 33 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d59e4f7036..b0928fefd2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,8 +92,9 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_VS_EXECUTE          21
-#define CELL_CMD_FLUSH_BUFFER_RANGE  22
+#define CELL_CMD_STATE_LOGICOP       21
+#define CELL_CMD_VS_EXECUTE          22
+#define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
 
 #define CELL_NUM_BUFFERS 4
@@ -124,6 +125,12 @@ struct cell_command_blend {
 };
 
 
+struct cell_command_logicop {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 9e79db0ace..0442abddc1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -92,6 +92,8 @@ struct cell_context
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
 
+   struct spe_function logic_op;
+
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
    struct pipe_constant_buffer constants[2];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 5709b48f12..5c1310d0b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -50,6 +50,23 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
+      struct cell_command_logicop logicop;
+
+      if (cell->logic_op.store != NULL) {
+	 spe_release_func(& cell->logic_op);
+      }
+
+      cell_generate_logic_op(& cell->logic_op,
+			     & cell->blend->base,
+			     cell->framebuffer.cbufs[0]);
+
+      logicop.base = (intptr_t) cell->logic_op.store;
+      logicop.size = 64 * 4;
+      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
+		     sizeof(logicop));
+   }
+
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index c750b1d89d..f10025bd7c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -977,7 +977,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
       spe_allocate_register(f, 13),
       spe_allocate_register(f, 14),
    };
-   const int mask = spe_allocate_register(f, 15);
    unsigned func[4];
    unsigned sF[4];
    unsigned dF[4];
@@ -1114,9 +1113,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
                                 func[i], sF[i], dF[i],
                                 frag[i], src_factor[i],
                                 pixel[i], dst_factor[i]);
-         spe_selb(f, frag[i], pixel[i], frag[i], mask);
-      } else {
-         spe_or(f, frag[i], pixel[i], pixel[i]);
       }
    }
 
@@ -1146,3 +1142,260 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
    }
 #endif
 }
+
+
+int PC_OFFSET(const struct spe_function *f, const void *d)
+{
+   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t ea = ~0x0f & (intptr_t) d;
+
+   return (ea - pc) >> 2;
+}
+
+
+/**
+ * Generate code to perform color conversion and logic op
+ *
+ * \bug
+ * The code generated by this function should also perform dithering.
+ *
+ * \bug
+ * The code generated by this function should also perform color-write
+ * masking.
+ *
+ * \bug
+ * This routine is hard-coded to only work with ARGB8 data.
+ */
+void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+                       struct pipe_surface *surf)
+{
+   const unsigned logic_op = (blend->logicop_enable)
+       ? blend->logicop_func : PIPE_LOGICOP_COPY;
+
+   /* This code generates a maximum of 37 instructions.  An additional 32
+    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
+    * to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.
+    */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the begining of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   data[0] = 0x00010203;
+   data[1] = 0x10111213;
+   data[2] = 0x04050607;
+   data[3] = 0x14151617;
+
+   data[4] = 0x0c000408;
+   data[5] = 0x80808080;
+   data[6] = 0x80808080;
+   data[7] = 0x80808080;
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted fragment colors and the pixel colors.
+    */
+   switch (logic_op) {
+   case PIPE_LOGICOP_CLEAR:
+      spe_il(f, frag[0], 0);
+      spe_il(f, frag[1], 0);
+      spe_il(f, frag[2], 0);
+      spe_il(f, frag[3], 0);
+      break;
+   case PIPE_LOGICOP_NOR:
+      spe_nor(f, frag[0], frag[0], pixel[0]);
+      spe_nor(f, frag[1], frag[1], pixel[1]);
+      spe_nor(f, frag[2], frag[2], pixel[2]);
+      spe_nor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      spe_andc(f, frag[0], pixel[0], frag[0]);
+      spe_andc(f, frag[1], pixel[1], frag[1]);
+      spe_andc(f, frag[2], pixel[2], frag[2]);
+      spe_andc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      spe_nor(f, frag[0], frag[0], frag[0]);
+      spe_nor(f, frag[1], frag[1], frag[1]);
+      spe_nor(f, frag[2], frag[2], frag[2]);
+      spe_nor(f, frag[3], frag[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      spe_andc(f, frag[0], frag[0], pixel[0]);
+      spe_andc(f, frag[1], frag[1], pixel[1]);
+      spe_andc(f, frag[2], frag[2], pixel[2]);
+      spe_andc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_XOR:
+      spe_xor(f, frag[0], frag[0], pixel[0]);
+      spe_xor(f, frag[1], frag[1], pixel[1]);
+      spe_xor(f, frag[2], frag[2], pixel[2]);
+      spe_xor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      spe_nand(f, frag[0], frag[0], pixel[0]);
+      spe_nand(f, frag[1], frag[1], pixel[1]);
+      spe_nand(f, frag[2], frag[2], pixel[2]);
+      spe_nand(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND:
+      spe_and(f, frag[0], frag[0], pixel[0]);
+      spe_and(f, frag[1], frag[1], pixel[1]);
+      spe_and(f, frag[2], frag[2], pixel[2]);
+      spe_and(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      spe_eqv(f, frag[0], frag[0], pixel[0]);
+      spe_eqv(f, frag[1], frag[1], pixel[1]);
+      spe_eqv(f, frag[2], frag[2], pixel[2]);
+      spe_eqv(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      spe_orc(f, frag[0], pixel[0], frag[0]);
+      spe_orc(f, frag[1], pixel[1], frag[1]);
+      spe_orc(f, frag[2], pixel[2], frag[2]);
+      spe_orc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      spe_orc(f, frag[0], frag[0], pixel[0]);
+      spe_orc(f, frag[1], frag[1], pixel[1]);
+      spe_orc(f, frag[2], frag[2], pixel[2]);
+      spe_orc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR:
+      spe_or(f, frag[0], frag[0], pixel[0]);
+      spe_or(f, frag[1], frag[1], pixel[1]);
+      spe_or(f, frag[2], frag[2], pixel[2]);
+      spe_or(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_SET:
+      spe_il(f, frag[0], ~0);
+      spe_il(f, frag[1], ~0);
+      spe_il(f, frag[2], ~0);
+      spe_il(f, frag[3], ~0);
+      break;
+
+   /* These two cases are short-circuited above.
+    */
+   case PIPE_LOGICOP_INVERT:
+   case PIPE_LOGICOP_NOOP:
+   default:
+      assert(0);
+   }
+
+
+   /* Apply fragment mask.
+    */
+   spe_ilh(f, tmp[0], 0x0000);
+   spe_ilh(f, tmp[1], 0x0404);
+   spe_ilh(f, tmp[2], 0x0808);
+   spe_ilh(f, tmp[3], 0x0c0c);
+
+   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
+   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
+   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
+   spe_shufb(f, tmp[3], mask, mask, tmp[3]);
+
+   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
+   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
+   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
+   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# %u instructions\n", f->csr - f->store);
+
+      printf("\t.text\n");
+      for (i = 0; i < 64; i++) {
+         printf("\t.long\t0x%04x\n", p[i]);
+      }
+      fflush(stdout);
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
index f699247f9e..ab4de96c69 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -31,4 +31,8 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
 extern void
 cell_generate_alpha_blend(struct cell_blend_state *cb);
 
+extern void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+    struct pipe_surface *surf);
+
 #endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0a490ab277..fccff01e10 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -64,6 +64,9 @@ static unsigned char depth_stencil_code_buffer[4 * 64]
 static unsigned char fb_blend_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char logicop_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -513,6 +516,22 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_STATE_LOGICOP: {
+         struct cell_command_logicop *code =
+             (struct cell_command_logicop *) &buffer[pos+1];
+
+              mfc_get(logicop_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+	 spu.logicop = (logicop_func) logicop_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+         break;
+      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 49f5d99674..c20452931a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,9 +77,14 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a,
+    qword const_r, qword const_g, qword const_b, qword const_a);
+
+typedef struct spu_blend_results (*logicop_func)(
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword frag_mask);
 
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -111,6 +116,8 @@ struct spu_global
    blend_func blend;
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
+   logicop_func logicop;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index e6a1ce01df..95c629a8aa 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -305,7 +305,6 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      const vector unsigned char shuffle = spu.color_shuffle;
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
@@ -330,45 +329,53 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+      /* Convert fragment data from AoS to SoA format.
+       */
+      qword soa_frag[4];
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
       /* Read the current framebuffer values.
-       *
-       * Ignore read_fb for now.  In the future we can use this to avoid
-       * reading the framebuffer if read_fb is false and the fragment mask is
-       * all 0xffffffff.  This is the common case, so it is probably worth
-       * the effort.  We'll have to profile to determine whether or not the
-       * extra conditional branches hurt overall performance.
        */
-      vec_float4 aos_pix[4] = {
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      const qword pix[4] = {
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
       };
 
       qword soa_pix[4];
-      qword soa_frag[4];
 
-      /* Convert pixel and fragment data from AoS to SoA format.
-       */
-      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      if (spu.read_fb) {
+         /* Convert pixel data from AoS to SoA format.
+          */
+         vec_float4 aos_pix[4] = {
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+         };
+
+         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      }
 
-      const struct spu_blend_results result =
+
+      struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
                        spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3],
-                       (qword) mask);
+                       spu.const_blend_color[2], spu.const_blend_color[3]);
 
 
       /* Convert final pixel data from SoA to AoS format.
        */
-      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
-      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
-      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
-      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
+                              result.r, result.g, result.b, result.a,
+                              (qword) mask);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
+      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
+      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
+      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
 #endif
 }
-- 
cgit v1.2.3


From 14452aee73e16f2ede075cf894e69d62cc539f5e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 31 Mar 2008 17:38:21 -0600
Subject: cell: initial work to support multi-texture

---
 src/gallium/drivers/cell/common.h              |  8 +++++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 18 +++++++---------
 src/gallium/drivers/cell/spu/spu_main.c        | 27 ++++++++++++++++--------
 src/gallium/drivers/cell/spu/spu_main.h        |  8 +++----
 src/gallium/drivers/cell/spu/spu_texture.c     | 29 ++++++++++++++++----------
 src/gallium/drivers/cell/spu/spu_tri.c         |  2 +-
 6 files changed, 55 insertions(+), 37 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c9e873b35c..298812fc20 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -66,6 +66,8 @@
 
 #define CELL_MAX_SPUS 6
 
+#define CELL_MAX_SAMPLERS 4
+
 #define TILE_SIZE 32
 
 
@@ -228,8 +230,10 @@ struct cell_command_release_verts
 
 struct cell_command_texture
 {
-   void *start;         /**< Address in main memory */
-   uint width, height;
+   struct {
+      void *start;         /**< Address in main memory */
+      ushort width, height;
+   } texture[CELL_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 4c75caa025..4fbe1a21b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -129,17 +129,15 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       struct cell_command_texture texture;
-      if (cell->texture[0]) {
-         texture.start = cell->texture[0]->tiled_data;
-         texture.width = cell->texture[0]->base.width[0];
-         texture.height = cell->texture[0]->base.height[0];
+      uint i;
+      memset(&texture, 0, sizeof(texture));
+      for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->texture[i]) {
+            texture.texture[i].start = cell->texture[i]->tiled_data;
+            texture.texture[i].width = cell->texture[i]->base.width[0];
+            texture.texture[i].height = cell->texture[i]->base.height[0];
+         }
       }
-      else {
-         texture.start = NULL;
-         texture.width = 0;
-         texture.height = 0;
-      }
-
       emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
                      &texture, sizeof(struct cell_command_texture));
    }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d7f46f8024..80fa5f7859 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -329,17 +329,26 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
-   if (Debug)
-      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
-             spu.init.id, texture->start, texture->width, texture->height);
+   uint i;
+
+   if (1||Debug) {
+      printf("SPU %u: TEXTURE\n", spu.init.id);
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         printf("  %d: at %p  size %u x %u\n", i, texture->texture[i].start,
+                texture->texture[i].width, texture->texture[i].height);
+      }
+   }
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = (vector float)
-      { spu.texture.width, spu.texture.height, 0.0, 0.0};
-   spu.tex_size_mask = (vector unsigned int)
-      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
-   spu.tex_size_x_mask = spu_splats(spu.texture.width - 1);
-   spu.tex_size_y_mask = spu_splats(spu.texture.height - 1);
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      const uint width = texture->texture[i].width;
+      const uint height = texture->texture[i].height;
+      spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0};
+      spu.tex_size_mask[i] = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+      spu.tex_size_x_mask[i] = spu_splats(width - 1);
+      spu.tex_size_y_mask[i] = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index c20452931a..8a87787537 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -141,10 +141,10 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size[CELL_MAX_SAMPLERS];
+   vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
 
    vector float (*sample_texture)(vector float texcoord);
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 67eb08196a..91a6aec5ec 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -40,25 +40,29 @@
 void
 invalidate_tex_cache(void)
 {
-   spu_dcache_mark_dirty((unsigned) spu.texture.start,
-                         4 * spu.texture.width * spu.texture.height);
+   uint unit = 0;
+   uint bytes = 4 * spu.texture.texture[unit].width
+      * spu.texture.texture[unit].height;
+
+   spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes);
 }
 
 
 static uint
 get_texel(vec_uint4 coordinate)
 {
+   const uint unit = 0;
    vec_uint4 tmp;
    unsigned x = spu_extract(coordinate, 0);
    unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture.width / TILE_SIZE;
+   const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE;
    unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
                                             + (x / TILE_SIZE));
    unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
                                 + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture.start + tile_offset + texel_offset,
+                              spu.texture.texture[unit].start + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
@@ -67,13 +71,14 @@ get_texel(vec_uint4 coordinate)
 static void
 get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   const uint unit = 0;
+   const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -101,9 +106,10 @@ get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 vector float
 sample_texture_nearest(vector float texcoord)
 {
-   vector float tc = spu_mul(texcoord, spu.tex_size);
+   const uint unit = 0;
+   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc = spu_and(itc, spu.tex_size_mask[unit]);        /* mask (GL_REPEAT) */
    uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
@@ -112,10 +118,11 @@ sample_texture_nearest(vector float texcoord)
 vector float
 sample_texture_bilinear(vector float texcoord)
 {
+   const uint unit = 0;
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
-   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
@@ -129,8 +136,8 @@ sample_texture_bilinear(vector float texcoord)
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
-   x = spu_and(x, spu.tex_size_x_mask);
-   y = spu_and(y, spu.tex_size_y_mask);
+   x = spu_and(x, spu.tex_size_x_mask[unit]);
+   y = spu_and(y, spu.tex_size_y_mask[unit]);
 
    get_four_texels(x, y, texels);
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 95c629a8aa..9f63317b1f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask )
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture.start) {
+      if (spu.texture.texture[0].start) {
          /* texture mapping */
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
-- 
cgit v1.2.3


From e6c981f22c0b6469ef44e9d7a34113db34647fef Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 31 Mar 2008 21:09:02 -0600
Subject: cell: more work for multi-texture support

---
 src/gallium/drivers/cell/common.h              | 17 ++++++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 31 ++++++++++-----
 src/gallium/drivers/cell/spu/spu_main.c        | 55 +++++++++++++++-----------
 src/gallium/drivers/cell/spu/spu_main.h        | 18 ++++++---
 src/gallium/drivers/cell/spu/spu_texture.c     | 24 +++++------
 src/gallium/drivers/cell/spu/spu_tri.c         |  2 +-
 6 files changed, 90 insertions(+), 57 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 298812fc20..f430e88b9c 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -36,6 +36,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 #include "pipe/p_format.h"
+#include "pipe/p_state.h"
 
 
 /** The standard assert macro doesn't seem to work reliably */
@@ -228,12 +229,20 @@ struct cell_command_release_verts
 };
 
 
+struct cell_command_sampler
+{
+   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   uint unit;
+   struct pipe_sampler_state state;
+};
+
+
 struct cell_command_texture
 {
-   struct {
-      void *start;         /**< Address in main memory */
-      ushort width, height;
-   } texture[CELL_MAX_SAMPLERS];
+   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint unit;
+   void *start;         /**< Address in main memory */
+   ushort width, height;
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 4fbe1a21b8..9cae67f091 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -121,25 +121,36 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      if (cell->sampler[0]) {
-         emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
-                        cell->sampler[0], sizeof(struct pipe_sampler_state));
+      uint i;
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->sampler[i]) {
+            struct cell_command_sampler *sampler
+               = cell_batch_alloc(cell, sizeof(*sampler));
+            sampler->opcode = CELL_CMD_STATE_SAMPLER;
+            sampler->unit = i;
+            sampler->state = *cell->sampler[i];
+         }
       }
    }
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
-      struct cell_command_texture texture;
       uint i;
-      memset(&texture, 0, sizeof(texture));
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         struct cell_command_texture *texture
+            =  cell_batch_alloc(cell, sizeof(*texture));
+         texture->opcode = CELL_CMD_STATE_TEXTURE;
+         texture->unit = i;
          if (cell->texture[i]) {
-            texture.texture[i].start = cell->texture[i]->tiled_data;
-            texture.texture[i].width = cell->texture[i]->base.width[0];
-            texture.texture[i].height = cell->texture[i]->base.height[0];
+            texture->start = cell->texture[i]->tiled_data;
+            texture->width = cell->texture[i]->base.width[0];
+            texture->height = cell->texture[i]->base.height[0];
+         }
+         else {
+            texture->start = NULL;
+            texture->width = 1;
+            texture->height = 1;
          }
       }
-      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
-                     &texture, sizeof(struct cell_command_texture));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 80fa5f7859..7f0473d198 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -312,13 +312,13 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 
 
 static void
-cmd_state_sampler(const struct pipe_sampler_state *state)
+cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    if (Debug)
-      printf("SPU %u: SAMPLER\n",
-             spu.init.id);
+      printf("SPU %u: SAMPLER [%u]\n",
+             spu.init.id, sampler->unit);
 
-   memcpy(&spu.sampler[0], state, sizeof(*state));
+   spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
       spu.sample_texture = sample_texture_bilinear;
    else
@@ -329,26 +329,25 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
-   uint i;
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
 
-   if (1||Debug) {
-      printf("SPU %u: TEXTURE\n", spu.init.id);
-      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-         printf("  %d: at %p  size %u x %u\n", i, texture->texture[i].start,
-                texture->texture[i].width, texture->texture[i].height);
-      }
+   if (Debug) {
+      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
+             texture->unit, texture->start,
+             texture->width, texture->height);
    }
 
-   memcpy(&spu.texture, texture, sizeof(*texture));
-   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      const uint width = texture->texture[i].width;
-      const uint height = texture->texture[i].height;
-      spu.tex_size[i] = (vector float) { width, height, 0.0, 0.0};
-      spu.tex_size_mask[i] = (vector unsigned int)
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
          { width - 1, height - 1, 0, 0 };
-      spu.tex_size_x_mask[i] = spu_splats(width - 1);
-      spu.tex_size_y_mask[i] = spu_splats(height - 1);
-   }
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
 
 
@@ -480,12 +479,20 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
-         cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
          break;
       case CELL_CMD_STATE_TEXTURE:
-         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8a87787537..2bfad3535a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -100,6 +100,17 @@ struct spu_framebuffer {
 } ALIGN16_ATTRIB;
 
 
+struct spu_texture
+{
+   void *start;
+   uint width, height;
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+} ALIGN16_ATTRIB;
+
+
 /**
  * All SPU global/context state will be in singleton object of this type:
  */
@@ -119,7 +130,7 @@ struct spu_global
    logicop_func logicop;
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
-   struct cell_command_texture texture;
+   struct spu_texture texture[PIPE_MAX_SAMPLERS];
 
    struct vertex_info vertex_info;
 
@@ -141,11 +152,6 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float tex_size[CELL_MAX_SAMPLERS];
-   vector unsigned int tex_size_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask[CELL_MAX_SAMPLERS]; /**< == int(size - 1) */
-
    vector float (*sample_texture)(vector float texcoord);
 
 } ALIGN16_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 91a6aec5ec..4612501eb3 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -41,10 +41,10 @@ void
 invalidate_tex_cache(void)
 {
    uint unit = 0;
-   uint bytes = 4 * spu.texture.texture[unit].width
-      * spu.texture.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].width
+      * spu.texture[unit].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
 }
 
 
@@ -55,14 +55,14 @@ get_texel(vec_uint4 coordinate)
    vec_uint4 tmp;
    unsigned x = spu_extract(coordinate, 0);
    unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture.texture[unit].width / TILE_SIZE;
+   const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE;
    unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
                                             + (x / TILE_SIZE));
    unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
                                 + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture.texture[unit].start + tile_offset + texel_offset,
+                              spu.texture[unit].start + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
@@ -72,13 +72,13 @@ static void
 get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const uint unit = 0;
-   const unsigned texture_ea = (uintptr_t) spu.texture.texture[unit].start;
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture.texture[unit].width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,9 +107,9 @@ vector float
 sample_texture_nearest(vector float texcoord)
 {
    const uint unit = 0;
-   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.tex_size_mask[unit]);        /* mask (GL_REPEAT) */
+   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
    uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
@@ -122,7 +122,7 @@ sample_texture_bilinear(vector float texcoord)
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
-   vector float tc = spu_mul(texcoord, spu.tex_size[unit]);
+   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
@@ -136,8 +136,8 @@ sample_texture_bilinear(vector float texcoord)
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
-   x = spu_and(x, spu.tex_size_x_mask[unit]);
-   y = spu_and(y, spu.tex_size_y_mask[unit]);
+   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
+   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
    get_four_texels(x, y, texels);
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 9f63317b1f..17e337bbdf 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -309,7 +309,7 @@ emit_quad( int x, int y, mask_t mask )
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture.texture[0].start) {
+      if (spu.texture[0].start) {
          /* texture mapping */
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
-- 
cgit v1.2.3


From e7b23d36df1ab3ac5b54ef8e4e56c4fd46db8257 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 11:35:53 -0600
Subject: cell: checkpoint: more multi-texture work

---
 src/gallium/drivers/cell/ppu/cell_texture.c |  8 +++++--
 src/gallium/drivers/cell/spu/spu_main.c     |  2 +-
 src/gallium/drivers/cell/spu/spu_main.h     |  2 +-
 src/gallium/drivers/cell/spu/spu_texture.c  |  6 ++---
 src/gallium/drivers/cell/spu/spu_texture.h  |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c      | 34 +++++++++++++++++++++++++----
 6 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c694e136d..c59d1f7f23 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -245,9 +245,13 @@ void
 cell_update_texture_mapping(struct cell_context *cell)
 {
    uint face = 0, level = 0, zslice = 0;
+   uint i;
+
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      if (cell->texture[i])
+         cell_tile_texture(cell, cell->texture[i]);
+   }
 
-   if (cell->texture[0])
-      cell_tile_texture(cell, cell->texture[0]);
 #if 0
    if (cell->tex_surf && cell->tex_map) {
       pipe_surface_unmap(cell->tex_surf);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 5b5a570a3c..a840d01596 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -333,7 +333,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (Debug) {
+   if (1||Debug) {
       printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
              texture->unit, texture->start,
              texture->width, texture->height);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2bfad3535a..26e050cfc3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -152,7 +152,7 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float (*sample_texture)(vector float texcoord);
+   vector float (*sample_texture)(uint unit, vector float texcoord);
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 4612501eb3..58a426cc40 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -104,9 +104,8 @@ get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
  * XXX this is extremely primitive for now.
  */
 vector float
-sample_texture_nearest(vector float texcoord)
+sample_texture_nearest(uint unit, vector float texcoord)
 {
-   const uint unit = 0;
    vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
@@ -116,9 +115,8 @@ sample_texture_nearest(vector float texcoord)
 
 
 vector float
-sample_texture_bilinear(vector float texcoord)
+sample_texture_bilinear(uint unit, vector float texcoord)
 {
-   const uint unit = 0;
    static const vec_uint4 offset_x = {0, 0, 1, 1};
    static const vec_uint4 offset_y = {0, 1, 0, 1};
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 95eb87080f..f7c9738be8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,11 +37,11 @@ invalidate_tex_cache(void);
 
 
 extern vector float
-sample_texture_nearest(vector float texcoord);
+sample_texture_nearest(uint unit, vector float texcoord);
 
 
 extern vector float
-sample_texture_bilinear(vector float texcoord);
+sample_texture_bilinear(uint unit, vector float texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 17e337bbdf..51abcb1757 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -311,17 +311,43 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.texture[0].start) {
          /* texture mapping */
+         const uint unit = 0;
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture(texcoords[0]);
+            colors[0] = spu.sample_texture(unit, texcoords[0]);
          if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture(texcoords[1]);
+            colors[1] = spu.sample_texture(unit, texcoords[1]);
          if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture(texcoords[2]);
+            colors[2] = spu.sample_texture(unit, texcoords[2]);
          if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture(texcoords[3]);
+            colors[3] = spu.sample_texture(unit, texcoords[3]);
+
+
+         if (spu.texture[1].start) {
+            /* multi-texture mapping */
+            const uint unit = 1;
+            vector float colors1[4];
+
+            eval_coeff(3, (float) x, (float) y, texcoords);
+
+            if (spu_extract(mask, 0))
+               colors1[0] = spu.sample_texture(unit, texcoords[0]);
+            if (spu_extract(mask, 1))
+               colors1[1] = spu.sample_texture(unit, texcoords[1]);
+            if (spu_extract(mask, 2))
+               colors1[2] = spu.sample_texture(unit, texcoords[2]);
+            if (spu_extract(mask, 3))
+               colors1[3] = spu.sample_texture(unit, texcoords[3]);
+
+            /* hack: modulate first texture by second */
+            colors[0] = spu_mul(colors[0], colors1[0]);
+            colors[1] = spu_mul(colors[1], colors1[1]);
+            colors[2] = spu_mul(colors[2], colors1[2]);
+            colors[3] = spu_mul(colors[3], colors1[3]);
+         }
+
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From bccd3f138ccc717fad9073110d15e3321b590476 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 1 Apr 2008 15:42:42 -0600
Subject: cell: more multi-texture fixes (mostly working now)

---
 src/gallium/drivers/cell/spu/spu_main.c    |  6 +++---
 src/gallium/drivers/cell/spu/spu_main.h    |  4 +++-
 src/gallium/drivers/cell/spu/spu_texture.c |  5 ++---
 src/gallium/drivers/cell/spu/spu_tri.c     | 18 +++++++++---------
 4 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 5b5a570a3c..1ab1c40379 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -319,10 +319,10 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
              spu.init.id, sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture = sample_texture_bilinear;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
    else
-      spu.sample_texture = sample_texture_nearest;
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 26e050cfc3..e9e39cbeab 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,6 +85,8 @@ typedef struct spu_blend_results (*logicop_func)(
     qword frag_mask);
 
 
+typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -152,7 +154,7 @@ struct spu_global
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
 
-   vector float (*sample_texture)(uint unit, vector float texcoord);
+   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ba0d57b959..ceab246980 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -68,9 +68,8 @@ get_texel(uint unit, vec_uint4 coordinate)
 
 
 static void
-get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   const uint unit = 0;
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    vec_uint4 tile_x = spu_rlmask(x, -5);
    vec_uint4 tile_y = spu_rlmask(y, -5);
@@ -136,7 +135,7 @@ sample_texture_bilinear(uint unit, vector float texcoord)
    x = spu_and(x, spu.texture[unit].tex_size_x_mask);
    y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
-   get_four_texels(x, y, texels);
+   get_four_texels(unit, x, y, texels);
 
    vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
    vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 51abcb1757..ab4ff8160a 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -316,13 +316,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture(unit, texcoords[0]);
+            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
          if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture(unit, texcoords[1]);
+            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
          if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture(unit, texcoords[2]);
+            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
          if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture(unit, texcoords[3]);
+            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
 
 
          if (spu.texture[1].start) {
@@ -330,16 +330,16 @@ emit_quad( int x, int y, mask_t mask )
             const uint unit = 1;
             vector float colors1[4];
 
-            eval_coeff(3, (float) x, (float) y, texcoords);
+            eval_coeff(2, (float) x, (float) y, texcoords);
 
             if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture(unit, texcoords[0]);
+               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
             if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture(unit, texcoords[1]);
+               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
             if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture(unit, texcoords[2]);
+               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
             if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture(unit, texcoords[3]);
+               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
 
             /* hack: modulate first texture by second */
             colors[0] = spu_mul(colors[0], colors1[0]);
-- 
cgit v1.2.3


From 217d37940771dd02ff1aa365105eca2c7a09d623 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 2 Apr 2008 14:01:42 -0600
Subject: cell: minor texture improvements

Precompute tiles_per_row.  Use ushort multiplies in a few places.  New comments.
---
 src/gallium/drivers/cell/spu/spu_main.c    |  2 ++
 src/gallium/drivers/cell/spu/spu_main.h    |  3 ++-
 src/gallium/drivers/cell/spu/spu_texture.c | 32 ++++++++++++++++++++----------
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1ab1c40379..e04ffeb9b1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -343,6 +343,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
    spu.texture[unit].tex_size_mask = (vector unsigned int)
          { width - 1, height - 1, 0, 0 };
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e9e39cbeab..e962e1426c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -105,7 +105,8 @@ struct spu_framebuffer {
 struct spu_texture
 {
    void *start;
-   uint width, height;
+   ushort width, height;
+   ushort tiles_per_row;
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
    vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ceab246980..e9a2754e57 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -51,22 +51,25 @@ invalidate_tex_cache(void)
 static uint
 get_texel(uint unit, vec_uint4 coordinate)
 {
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   ushort x = spu_extract(coordinate, 0);
+   ushort y = spu_extract(coordinate, 1);
+   unsigned tile_offset = sizeof(tile_t)
+      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
+   ushort texel_offset = (ushort) 4
+      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
    vec_uint4 tmp;
-   unsigned x = spu_extract(coordinate, 0);
-   unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE;
-   unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
-                                            + (x / TILE_SIZE));
-   unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
-                                + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture[unit].start + tile_offset + texel_offset,
+                              texture_ea + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
 
 
+/**
+ * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ */
 static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
@@ -76,7 +79,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -97,6 +100,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
                               texture_ea + spu_extract(offset, 3), 4);
 }
 
+
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -126,17 +130,25 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    vec_uint4 texels[4];
    
+   /* setup texcoords for quad:
+    *  +-----+-----+
+    *  |x0,y0|x1,y1|
+    *  +-----+-----+
+    *  |x2,y2|x3,y3|
+    *  +-----+-----+
+    */
    vec_uint4 x = spu_splats(spu_extract(itc, 0));
    vec_uint4 y = spu_splats(spu_extract(itc, 1));
-
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
+   /* GL_REPEAT wrap mode: */
    x = spu_and(x, spu.texture[unit].tex_size_x_mask);
    y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
    get_four_texels(unit, x, y, texels);
 
+   /* integer A8R8G8B8 to float texel conversion */
    vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
    vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
    vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-- 
cgit v1.2.3


From 8af4794afcfa04351d4131d826daeb1312634f82 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 10:18:00 -0600
Subject: cell: code clean-up, comments

---
 src/gallium/drivers/cell/spu/spu_main.c | 67 +++++++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_main.h |  8 ++--
 2 files changed, 43 insertions(+), 32 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0d4cdfae85..d223f32d94 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -55,6 +55,10 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
@@ -332,6 +336,21 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 }
 
 
+static void
+cmd_state_logicop(const struct cell_command_logicop * code)
+{
+   mfc_get(logicop_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   spu.logicop = (logicop_func) logicop_code_buffer;
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -401,6 +420,21 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
 }
 
 
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
 static void
 cmd_finish(void)
 {
@@ -539,38 +573,15 @@ cmd_batch(uint opcode)
                                 (struct cell_shader_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
-      case CELL_CMD_STATE_ATTRIB_FETCH: {
-         struct cell_attribute_fetch_code *code =
-             (struct cell_attribute_fetch_code *) &buffer[pos+1];
-
-              mfc_get(attribute_fetch_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      }
-      case CELL_CMD_STATE_LOGICOP: {
-         struct cell_command_logicop *code =
-             (struct cell_command_logicop *) &buffer[pos+1];
-
-              mfc_get(logicop_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-	 spu.logicop = (logicop_func) logicop_code_buffer;
+      case CELL_CMD_STATE_LOGICOP:
+         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
          break;
-      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e962e1426c..4879f8c9c8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -124,13 +124,13 @@ struct spu_global
    struct spu_framebuffer fb;
    boolean read_depth;
    boolean read_stencil;
-   frag_test_func frag_test;
+   frag_test_func frag_test;  /**< Current depth/stencil test code */
    
-   boolean read_fb;
-   blend_func blend;
+   boolean read_fb;   /**< Does current blend mode require framebuffer read? */
+   blend_func blend;  /**< Current blend code */
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
-   logicop_func logicop;
+   logicop_func logicop;  /**< Current logicop code **/
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From 04ae4fba3c0a656cf2747fc994b99f99576d0e2b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 8 Sep 2008 11:53:14 -0600
Subject: cell: minor change to Z float/int conversion code (avoid switch)

---
 src/gallium/drivers/cell/spu/spu_main.c            |  5 ++++
 src/gallium/drivers/cell/spu/spu_main.h            |  5 ++++
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 34 +++++++++-------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d223f32d94..c4236817a9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -252,12 +252,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
       spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
       break;
    case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
       break;
    default:
       spu.fb.zsize = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4879f8c9c8..c2a53c9dcf 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,10 @@
 #define MAX_HEIGHT 1024
 
 
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
@@ -99,6 +103,7 @@ struct spu_framebuffer {
    uint depth_clear_value;
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index db88735226..29dc07a2e8 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -144,18 +144,22 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
 
    case PIPE_FORMAT_Z24S8_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 1110,1110,1110,1110 */
       qword mask = si_fsmbi(0xEEEE);
-
+      /* depth[i] = depth[i] << 8 */
       depth = si_shli(depth, 8);
+      /* *ptr[i] = depth[i][31:8] | stencil[i][7:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
 
    case PIPE_FORMAT_S8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
+      /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
-
+      /* stencil[i] = stencil[i] << 24 */
       stencil = si_shli(stencil, 24);
+      /* *ptr[i] = stencil[i][31:24] | depth[i][23:0] */
       *ptr = si_selb(stencil, depth, mask);
       break;
    }
@@ -191,25 +195,13 @@ spu_do_depth_stencil(int x, int y,
       read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
                    &pixel_depth, &pixel_stencil);
    }
-   
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   default:
-      ASSERT(0);
-      break;
-   }
+
+   /* convert floating point Z values to 32-bit uint */
+
+   /* frag_depth *= spu.fb.zscale */
+   frag_depth = si_fm(frag_depth, (qword)spu_splats(spu.fb.zscale));
+   /* frag_depth = uint(frag_depth) */
+   frag_depth = si_cfltu(frag_depth, 0);
 
    result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
                              frag_depth, frag_alpha, facing);
-- 
cgit v1.2.3


From 284ab5a6127f8b452acaa0e10ac1d9ebc87fac3e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 18:22:00 -0600
Subject: cell: checkpoint commit of new per-fragment processing

Do code generation for alpha test, z test, stencil, blend, colormask
and framebuffer/tile read/write as a single code block.
Ian's previous blend/z/stencil test code is still there but mostly disabled
and will be removed soon.
---
 src/gallium/drivers/cell/common.h                  |  20 +-
 src/gallium/drivers/cell/ppu/Makefile              |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 530 +++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |  38 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  31 +-
 .../drivers/cell/ppu/cell_state_per_fragment.c     |   2 +-
 src/gallium/drivers/cell/spu/Makefile              |   2 +-
 src/gallium/drivers/cell/spu/spu_main.c            |  53 ++-
 src/gallium/drivers/cell/spu/spu_main.h            |  23 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 231 ++++++++-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |  11 +
 src/gallium/drivers/cell/spu/spu_tri.c             |  30 ++
 src/gallium/winsys/xlib/xm_api.c                   |   7 +-
 src/gallium/winsys/xlib/xm_winsys.c                |  35 ++
 14 files changed, 998 insertions(+), 16 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fragment.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c0ca201e1d..a62530c64d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -97,6 +97,7 @@
 #define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_FRAGMENT_OPS  24
 
 
 #define CELL_NUM_BUFFERS 4
@@ -112,30 +113,43 @@
 
 /**
  */
-struct cell_command_depth_stencil_alpha_test {
+struct cell_command_depth_stencil_alpha_test
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_depth;         /**< Flag: should depth be read? */
    unsigned read_stencil;       /**< Flag: should stencil be read? */
+   struct pipe_depth_stencil_alpha_state state;
 };
 
 
 /**
  * Upload code to perform framebuffer blend operation
  */
-struct cell_command_blend {
+struct cell_command_blend
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
    unsigned read_fb;            /**< Flag: should framebuffer be read? */
 };
 
 
-struct cell_command_logicop {
+struct cell_command_logicop
+{
    uint64_t base;               /**< Effective address of code start. */
    unsigned size;               /**< Size in bytes of SPE code. */
 };
 
 
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 25473e200c..b5a6fcb8de 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -25,6 +25,7 @@ SOURCES = \
 	cell_context.c \
 	cell_draw_arrays.c \
 	cell_flush.c \
+	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_per_fragment.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..df29476be6
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,530 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instruction onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   register containing result of Z test/comparison (out)
+ */
+static void
+gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (ztest passed) {
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   }
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instruction onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out)
+ */
+void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa =
+      &cell->depth_stencil->base;
+   const struct pipe_blend_state *blend = &cell->blend->base;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
+      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      if (dsa->depth.enabled) {
+         /* Extract Z bits from fbZS_reg into fbZ_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            int mask_reg = spe_allocate_available_register(f);
+            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
+            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
+            spe_release_register(f, mask_reg);
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else {
+            /* XXX handle other z/stencil formats */
+            ASSERT(0);
+         }
+
+         /* Convert fragZ values from float[4] to uint[4] */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* 24-bit Z values */
+            int scale_reg = spe_allocate_available_register(f);
+
+            /* scale_reg[0,1,2,3] = float(2^24-1) */
+            spe_load_float(f, scale_reg, (float) 0xffffff);
+
+            /* XXX these two instructions might be combined */
+            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+
+            spe_release_register(f, scale_reg);
+         }
+         else {
+            /* XXX handle 16-bit Z format */
+            ASSERT(0);
+         }
+      }
+
+      if (dsa->stencil[0].enabled) {
+         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX extract with a shift */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX extract with a mask */
+            ASSERT(0);
+         }
+      }
+
+
+      if (dsa->stencil[0].enabled) {
+         /* XXX this may involve depth testing too */
+         // gen_stencil_test(dsa, f, ... );
+         ASSERT(0);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+
+      /* do we need to write Z and/or Stencil back into framebuffer? */
+      write_depth_stencil = (dsa->depth.writemask |
+                             dsa->stencil[0].write_mask |
+                             dsa->stencil[1].write_mask);
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit Z values in bits [7..0].
+          */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else {
+            /* bad zs_format */
+            ASSERT(0);
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      spe_release_register(f, fbZ_reg);
+      spe_release_register(f, fbS_reg);
+   }
+
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+
+   if (blend->blend_enable) {
+      /* convert packed tile colors in fbRGBA_reg to float[4] vectors */
+
+      // gen_blend_code(blend, f, mask_reg, ... );
+
+   }
+
+
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+      spe_cfltu(f, fragR_reg, fragR_reg, 32);
+      spe_cfltu(f, fragG_reg, fragG_reg, 32);
+      spe_cfltu(f, fragB_reg, fragB_reg, 32);
+      spe_cfltu(f, fragA_reg, fragA_reg, 32);
+
+      /* Shift most the significant bytes to least the significant positions.
+       * I.e.: reg = reg >> 24
+       */
+      spe_rotmi(f, fragR_reg, fragR_reg, -24);
+      spe_rotmi(f, fragG_reg, fragG_reg, -24);
+      spe_rotmi(f, fragB_reg, fragB_reg, -24);
+      spe_rotmi(f, fragA_reg, fragA_reg, -24);
+
+      /* Shift the color bytes according to the surface format */
+      if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+         spe_roti(f, fragG_reg, fragG_reg, 8);   /* green <<= 8 */
+         spe_roti(f, fragR_reg, fragR_reg, 16);  /* red <<= 16 */
+         spe_roti(f, fragA_reg, fragA_reg, 24);  /* alpha <<= 24 */
+      }
+      else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+         spe_roti(f, fragR_reg, fragR_reg, 8);   /* red <<= 8 */
+         spe_roti(f, fragG_reg, fragG_reg, 16);  /* green <<= 16 */
+         spe_roti(f, fragB_reg, fragB_reg, 24);  /* blue <<= 24 */
+      }
+      else {
+         ASSERT(0);
+      }
+
+      /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+       * Eg: after shifting according to color_format we might have:
+       *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+       *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+       *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+       *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+       * OR-ing all those together gives us four packed colors:
+       *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+       */
+      spe_or(f, rgba_reg, fragR_reg, fragG_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragB_reg);
+      spe_or(f, rgba_reg, rgba_reg, fragA_reg);
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..0ea0fc690c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+
+extern void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f2feaa329a..06777aac14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
 
 #include "util/u_memory.h"
 #include "cell_context.h"
+#include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_state_per_fragment.h"
@@ -83,6 +84,29 @@ cell_emit_state(struct cell_context *cell)
       fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
       fb->width = cell->framebuffer.width;
       fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
+   }
+
+
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+      /* XXX we don't want to always do codegen here.  We should have
+       * a hash/lookup table to cache previous results...
+       */
+      struct cell_command_fragment_ops *fops
+            = cell_batch_alloc(cell, sizeof(*fops));
+      struct spe_function spe_code;
+
+      /* generate new code */
+      gen_fragment_function(cell, &spe_code);
+      /* put the new code into the batch buffer */
+      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(&fops->code, spe_code.store,
+             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* free codegen buffer */
+      spe_release_func(&spe_code);
    }
 
    if (cell->dirty & CELL_NEW_BLEND) {
@@ -90,8 +114,7 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->blend != NULL) {
          blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = (char *) cell->blend->code.csr
-             - (char *) cell->blend->code.store;
+         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
          blend.read_fb = TRUE;
       }
       else {
@@ -108,10 +131,10 @@ cell_emit_state(struct cell_context *cell)
 
       if (cell->depth_stencil != NULL) {
 	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = (char *) cell->depth_stencil->code.csr
-	     - (char *) cell->depth_stencil->code.store;
+	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
 	 dsat.read_depth = TRUE;
 	 dsat.read_stencil = FALSE;
+         dsat.state = cell->depth_stencil->base;
       }
       else {
 	 dsat.base = 0;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 705867107b..78cb446c14 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -1158,7 +1158,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 static int
 PC_OFFSET(const struct spe_function *f, const void *d)
 {
-   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
    const intptr_t ea = ~0x0f & (intptr_t) d;
 
    return (ea - pc) >> 2;
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index d49abb2e82..e285ae9fdb 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -43,7 +43,7 @@ INCLUDE_DIRS = \
 	$(SPU_CC) $(SPU_CFLAGS) -c $<
 
 .c.s:
-	$(SPU_CC) $(SPU_CFLAGS) -S $<
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
 
 
 # The .a file will be linked into the main/PPU executable
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c4236817a9..4e0ec15925 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 //#include "spu_test.h"
@@ -46,7 +47,7 @@
 /*
 helpful headers:
 /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
-/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+/opt/cell/sdk/usr/include/libmisc.h
 */
 
 boolean Debug = FALSE;
@@ -226,6 +227,24 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 }
 
 
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Point function pointer at new code */
+   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -257,6 +276,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       spu.fb.zsize = 4;
       spu.fb.zscale = (float) 0x00ffffffu;
       break;
@@ -282,6 +303,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+#define NEW_FRAGMENT_FUNCTION 01
+
 static void
 cmd_state_blend(const struct cell_command_blend *state)
 {
@@ -302,7 +325,9 @@ cmd_state_blend(const struct cell_command_blend *state)
       wait_on_mask(1 << TAG_BATCH_BUFFER);
       spu.blend = (blend_func) fb_blend_code_buffer;
       spu.read_fb = state->read_fb;
-   } else {
+   }
+   else
+   {
       spu.read_fb = FALSE;
    }
 }
@@ -326,7 +351,9 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
 	      0, /* tid */
 	      0  /* rid */);
       wait_on_mask(1 << TAG_BATCH_BUFFER);
-   } else {
+   }
+   else
+   {
       /* If there is no code, emit a return instruction.
        */
       depth_stencil_code_buffer[0] = 0x35;
@@ -338,12 +365,14 @@ cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *stat
    spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
    spu.read_depth = state->read_depth;
    spu.read_stencil = state->read_stencil;
+   spu.depth_stencil_alpha = state->state;
 }
 
 
 static void
 cmd_state_logicop(const struct cell_command_logicop * code)
 {
+#if !NEW_FRAGMENT_FUNCTION
    mfc_get(logicop_code_buffer,
            (unsigned int) code->base,  /* src */
            code->size,
@@ -353,6 +382,7 @@ cmd_state_logicop(const struct cell_command_logicop * code)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    spu.logicop = (logicop_func) logicop_code_buffer;
+#endif
 }
 
 
@@ -455,7 +485,9 @@ cmd_finish(void)
 
 
 /**
- * Execute a batch of commands
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
  * The opcode param encodes the location of the buffer and its size.
  */
 static void
@@ -519,6 +551,14 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
@@ -680,6 +720,11 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function.
+    */
+   spu.fragment_ops.func = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index c2a53c9dcf..7ab34f5222 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -91,6 +91,24 @@ typedef struct spu_blend_results (*logicop_func)(
 
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
+
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask);
+
+struct spu_fragment_ops
+{
+   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   spu_fragment_ops_func func;  /**< Current fragment ops function */
+} ALIGN16_ATTRIB;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -127,6 +145,9 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+
    boolean read_depth;
    boolean read_stencil;
    frag_test_func frag_test;  /**< Current depth/stencil test code */
@@ -142,6 +163,8 @@ struct spu_global
 
    struct vertex_info vertex_info;
 
+   struct spu_fragment_ops fragment_ops;
+
    /* XXX more state to come */
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 29dc07a2e8..ffc596aa62 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -29,8 +29,11 @@
  * \author Ian Romanick <idr@us.ibm.com>
  */
 
+
+#include <transpose_matrix4x4.h>
 #include "pipe/p_format.h"
 #include "spu_main.h"
+#include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
 #define ZERO 0x80
@@ -90,7 +93,8 @@ read_ds_quad(tile_t *tile, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &tile->ui4[iy][ix];
 
       *depth = si_and(*ptr, si_fsmbi(0x7777));
@@ -153,7 +157,8 @@ write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
       break;
    }
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM: {
       qword *ptr = (qword *) &buffer->ui4[iy][ix];
       /* form select mask = 0111,0111,0111,0111 */
       qword mask = si_fsmbi(0x7777);
@@ -217,3 +222,225 @@ spu_do_depth_stencil(int x, int y,
 
    return result.mask;
 }
+
+
+
+
+/**
+ * Called by rasterizer for each quad after the shader has run.  This
+ * is a fallback/debug function.  In reality we'll use a generated
+ * function produced by the PPU.  But this function is useful for
+ * debug/validation.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask)
+{
+   vector float frag_soa[4], frag_aos[4];
+   unsigned int c0, c1, c2, c3;
+
+   /* do alpha test */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragAlpha);  /* mask = (fragAlpha < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragAlpha, ref);  /* mask = (fragAlpha > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragAlpha, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragAlpha);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
+   }
+
+   /* Z and/or stencil testing... */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+   }
+
+   /* XXX do blending here */
+
+   /* XXX do colormask test here */
+
+
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      return;
+   }
+
+   /* convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA */
+#if 0
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragRed;
+      frag_soa[1] = fragGreen;
+      frag_soa[2] = fragBlue;
+      frag_soa[3] = fragAlpha;
+      _transpose_matrix4x4(frag_aos, frag_soa);
+   }
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragRed);
+   (void) fragGreen;
+   (void) fragBlue;
+#endif
+
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      break;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
+      ASSERT(0);
+   }
+
+#if 0
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;   
+#else
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = c3;   
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index 6571258699..ffadf0661c 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -29,4 +29,15 @@ extern qword
 spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
 		     qword frag_alpha, qword facing);
 
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a3ea0a3e69..71ef6ca24f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -297,9 +297,12 @@ emit_quad( int x, int y, mask_t mask )
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
 
+#define NEW_FRAGMENT_FUNCTION 01
+#if !NEW_FRAGMENT_FUNCTION
    if (spu.read_depth) {
       mask = do_depth_test(x, y, mask);
    }
+#endif
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -308,6 +311,7 @@ emit_quad( int x, int y, mask_t mask )
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
          /* texture mapping */
@@ -355,6 +359,29 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+#if NEW_FRAGMENT_FUNCTION
+      {
+         /* Convert fragment data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+          * This is temporary!
+          */
+         vector float soa_frag[4];
+         _transpose_matrix4x4(soa_frag, colors);
+
+         float4 fragZ;
+
+         fragZ.v = eval_z((float) x, (float) y);
+
+         /* Do all per-fragment/quad operations here, including:
+          *  alpha test, z test, stencil test, blend and framebuffer writing.
+          */
+         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
+                               fragZ.v,
+                               soa_frag[0], soa_frag[1],
+                               soa_frag[2], soa_frag[3],
+                               mask);
+      }
+#else
       /* Convert fragment data from AoS to SoA format.
        * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
        */
@@ -405,6 +432,9 @@ emit_quad( int x, int y, mask_t mask )
       spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
       spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
       spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
+
+#endif /* NEW_FRAGMENT_FUNCTION */
+
    }
 #endif
 }
diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index b010513107..28bd6ceab4 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -349,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
 
    if (vis->mesa_visual.depthBits == 0)
       depthFormat = PIPE_FORMAT_NONE;
+#ifdef GALLIUM_CELL /* XXX temporary for Cell! */
+   else
+      depthFormat = PIPE_FORMAT_S8Z24_UNORM;
+#else
    else if (vis->mesa_visual.depthBits <= 16)
-      depthFormat = PIPE_FORMAT_Z16_UNORM;
+      depthFormat = PIPE_FORMAT_Z16UNORM;
    else if (vis->mesa_visual.depthBits <= 24)
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
    else
       depthFormat = PIPE_FORMAT_Z32_UNORM;
+#endif
 
    if (vis->mesa_visual.stencilBits == 8) {
       if (depthFormat == PIPE_FORMAT_S8Z24_UNORM)
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 5e9a1f92f1..c4a30d3702 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -275,6 +275,39 @@ xm_buffer_destroy(struct pipe_winsys *pws,
 }
 
 
+/**
+ * For Cell.  Basically, rearrange the pixels/quads from this layout:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ *
+ * to this layout:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(uint *tile)
+{
+   uint tile2[TILE_SIZE * TILE_SIZE];
+   int y, x;
+
+   for (y = 0; y < TILE_SIZE; y+=2) {
+      for (x = 0; x < TILE_SIZE; x+=2) {
+         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+         tile2[y * TILE_SIZE + (x + 0)] = tile[k];
+         tile2[y * TILE_SIZE + (x + 1)] = tile[k+1];
+         tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2];
+         tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3];
+      }
+   }
+   memcpy(tile, tile2, sizeof(tile2));
+}
+
+
+
 /**
  * Display a surface that's in a tiled configuration.  That is, all the
  * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory.
@@ -321,6 +354,8 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
          ximage->data = (char *) xm_buf->data + offset;
 
+         twiddle_tile((uint *) ximage->data);
+
          if (XSHM_ENABLED(xm_buf)) {
 #if defined(USE_XSHM) && !defined(XFree86Server)
             XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
-- 
cgit v1.2.3


From 283ffdf99605c536d00e03ad6ec91a6f8e006fc2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:13:20 -0600
Subject: cell: checkpoint: remove more of the old per-fragment code

---
 src/gallium/drivers/cell/common.h              |   2 +
 src/gallium/drivers/cell/ppu/Makefile          |   1 -
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  60 ++-----------
 src/gallium/drivers/cell/spu/spu_main.c        | 115 +++----------------------
 src/gallium/drivers/cell/spu/spu_main.h        |  37 +-------
 5 files changed, 19 insertions(+), 196 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a62530c64d..61d2b7d1ae 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -146,6 +146,8 @@ struct cell_command_logicop
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b5a6fcb8de..8699f3f8ec 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -28,7 +28,6 @@ SOURCES = \
 	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
-	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 06777aac14..2bfb976c59 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -55,23 +55,6 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
-      struct cell_command_logicop logicop;
-
-      if (cell->logic_op.store != NULL) {
-	 spe_release_func(& cell->logic_op);
-      }
-
-      cell_generate_logic_op(& cell->logic_op,
-			     & cell->blend->base,
-			     cell->framebuffer.cbufs[0]);
-
-      logicop.base = (intptr_t) cell->logic_op.store;
-      logicop.size = 64 * 4;
-      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
-		     sizeof(logicop));
-   }
-
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
@@ -91,7 +74,9 @@ cell_emit_state(struct cell_context *cell)
    }
 
 
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL)) {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
       /* XXX we don't want to always do codegen here.  We should have
        * a hash/lookup table to cache previous results...
        */
@@ -105,47 +90,12 @@ cell_emit_state(struct cell_context *cell)
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      fops->dsa = cell->depth_stencil->base;
+      fops->blend = cell->blend->base;
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
 
-   if (cell->dirty & CELL_NEW_BLEND) {
-      struct cell_command_blend blend;
-
-      if (cell->blend != NULL) {
-         blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = cell->blend->code.num_inst * SPE_INST_SIZE;
-         blend.read_fb = TRUE;
-      }
-      else {
-         blend.base = 0;
-         blend.size = 0;
-         blend.read_fb = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
-   }
-
-   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      struct cell_command_depth_stencil_alpha_test dsat;
-
-      if (cell->depth_stencil != NULL) {
-	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = cell->depth_stencil->code.num_inst * SPE_INST_SIZE;
-	 dsat.read_depth = TRUE;
-	 dsat.read_stencil = FALSE;
-         dsat.state = cell->depth_stencil->base;
-      }
-      else {
-	 dsat.base = 0;
-	 dsat.size = 0;
-	 dsat.read_depth = FALSE;
-	 dsat.read_stencil = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat, sizeof(dsat));
-   }
-
    if (cell->dirty & CELL_NEW_SAMPLER) {
       uint i;
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4e0ec15925..6afca19dfd 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,14 +63,6 @@ struct spu_vs_context draw;
 static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
-static unsigned char depth_stencil_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char fb_blend_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char logicop_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
 
 
 /**
@@ -240,8 +232,15 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
    /* Point function pointer at new code */
    spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
 }
 
 
@@ -303,89 +302,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
-#define NEW_FRAGMENT_FUNCTION 01
-
-static void
-cmd_state_blend(const struct cell_command_blend *state)
-{
-   if (Debug)
-      printf("SPU %u: BLEND: enabled %d\n",
-             spu.init.id,
-             (state->size != 0));
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(fb_blend_code_buffer,
-              (unsigned int) state->base,  /* src */
-              ROUNDUP16(state->size),
-              TAG_BATCH_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-      spu.blend = (blend_func) fb_blend_code_buffer;
-      spu.read_fb = state->read_fb;
-   }
-   else
-   {
-      spu.read_fb = FALSE;
-   }
-}
-
-
-static void
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
-{
-   if (Debug)
-      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
-             spu.init.id,
-             state->read_depth);
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(depth_stencil_code_buffer,
-	      (unsigned int) state->base,  /* src */
-	      ROUNDUP16(state->size),
-	      TAG_BATCH_BUFFER,
-	      0, /* tid */
-	      0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-   }
-   else
-   {
-      /* If there is no code, emit a return instruction.
-       */
-      depth_stencil_code_buffer[0] = 0x35;
-      depth_stencil_code_buffer[1] = 0x00;
-      depth_stencil_code_buffer[2] = 0x00;
-      depth_stencil_code_buffer[3] = 0x00;
-   }
-
-   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
-   spu.read_depth = state->read_depth;
-   spu.read_stencil = state->read_stencil;
-   spu.depth_stencil_alpha = state->state;
-}
-
-
-static void
-cmd_state_logicop(const struct cell_command_logicop * code)
-{
-#if !NEW_FRAGMENT_FUNCTION
-   mfc_get(logicop_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   spu.logicop = (logicop_func) logicop_code_buffer;
-#endif
-}
-
-
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -571,15 +487,6 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
-      case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
-         break;
-      case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
-         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
@@ -614,19 +521,17 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
+#if 01
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+#endif
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      case CELL_CMD_STATE_LOGICOP:
-         cmd_state_logicop((struct cell_command_logicop *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
-         break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
@@ -695,7 +600,9 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
+#if 01
          spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 7ab34f5222..f0f8be47db 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,35 +60,6 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-struct spu_frag_test_results {
-   qword mask;
-   qword depth;
-   qword stencil;
-};
-
-typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
-    qword pixel_depth, qword pixel_stencil, qword frag_depth,
-    qword frag_alpha, qword facing);
-
-
-struct spu_blend_results {
-   qword r;
-   qword g;
-   qword b;
-   qword a;
-};
-
-typedef struct spu_blend_results (*blend_func)(
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a);
-
-typedef struct spu_blend_results (*logicop_func)(
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword frag_mask);
-
-
 typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
 
 
@@ -147,16 +118,10 @@ struct spu_global
    struct spu_framebuffer fb;
 
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
 
    boolean read_depth;
    boolean read_stencil;
-   frag_test_func frag_test;  /**< Current depth/stencil test code */
-   
-   boolean read_fb;   /**< Does current blend mode require framebuffer read? */
-   blend_func blend;  /**< Current blend code */
-   qword const_blend_color[4] ALIGN16_ATTRIB;
-
-   logicop_func logicop;  /**< Current logicop code **/
 
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From aa4a08d429712fa516342ec02253c2591794ea5f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 10:25:38 -0600
Subject: cell: asst. clean-up

---
 src/gallium/drivers/cell/spu/spu_main.c | 23 +++++-----------
 src/gallium/drivers/cell/spu/spu_main.h | 47 +++++++++++++++------------------
 src/gallium/drivers/cell/spu/spu_tri.c  | 10 +++----
 3 files changed, 32 insertions(+), 48 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6afca19dfd..29686964d2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -231,13 +231,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    if (Debug)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops.code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
    /* Point function pointer at new code */
-   spu.fragment_ops.func = (spu_fragment_ops_func) spu.fragment_ops.code;
+   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -288,17 +288,6 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
       break;
    }
-
-   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              12, 0, 4, 8, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              8, 4, 0, 12, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else
-      ASSERT(0);
 }
 
 
@@ -521,11 +510,11 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
-#if 01
+#if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
 #endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
@@ -600,7 +589,7 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
-#if 01
+#if 0
          spu_execute_vertex_shader(&draw, &cmd.vs);
 #endif
          break;
@@ -631,7 +620,7 @@ one_time_init(void)
    /* Install default/fallback fragment processing function.
     * This will normally be overriden by a code-gen'd function.
     */
-   spu.fragment_ops.func = spu_fallback_fragment_ops;
+   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f0f8be47db..d40539da83 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -60,9 +60,11 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
-
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
 
+/** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
                                       tile_t *depthStencilTile,
@@ -73,14 +75,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
-struct spu_fragment_ops
+struct spu_framebuffer
 {
-   uint code[SPU_MAX_FRAGMENT_OPS_INSTS];
-   spu_fragment_ops_func func;  /**< Current fragment ops function */
-} ALIGN16_ATTRIB;
-
-
-struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
    enum pipe_format color_format;
@@ -109,34 +105,31 @@ struct spu_texture
 
 
 /**
- * All SPU global/context state will be in singleton object of this type:
+ * All SPU global/context state will be in a singleton object of this type:
  */
 struct spu_global
 {
+   /** One-time init/constant info */
    struct cell_init_info init;
 
+   /*
+    * Current state
+    */
    struct spu_framebuffer fb;
-
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
-
-   boolean read_depth;
-   boolean read_stencil;
-
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-
    struct vertex_info vertex_info;
 
-   struct spu_fragment_ops fragment_ops;
-
-   /* XXX more state to come */
-
-
-   /** current color and Z tiles */
+   /** Current color and Z tiles */
    tile_t ctile ALIGN16_ATTRIB;
    tile_t ztile ALIGN16_ATTRIB;
 
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
@@ -144,11 +137,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+   /** Current fragment ops machine code */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
 
-   /** for converting RGBA to PIPE_FORMAT_x colors */
-   vector unsigned char color_shuffle;
-
-   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a5bf3270c7..f02cdd1f76 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -333,11 +333,11 @@ emit_quad( int x, int y, mask_t mask )
          /* Do all per-fragment/quad operations here, including:
           *  alpha test, z test, stencil test, blend and framebuffer writing.
           */
-         spu.fragment_ops.func(ix, iy, &spu.ctile, &spu.ztile,
-                               fragZ.v,
-                               soa_frag[0], soa_frag[1],
-                               soa_frag[2], soa_frag[3],
-                               mask);
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ.v,
+                          soa_frag[0], soa_frag[1],
+                          soa_frag[2], soa_frag[3],
+                          mask);
       }
 
    }
-- 
cgit v1.2.3


From aa66f08a21b791f338b519f0c2162cd8f7b3aeb0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:59:52 -0600
Subject: cell: initial support for fragment shader code generation.

TGSI shaders are translated into SPE instructions which are then sent to
the SPEs for execution.  Only a few opcodes work, no swizzling yet, no
support for constants/immediates, etc.
---
 src/gallium/drivers/cell/common.h                |  15 +
 src/gallium/drivers/cell/ppu/Makefile            |   1 +
 src/gallium/drivers/cell/ppu/cell_context.h      |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 523 +++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_gen_fp.h       |  42 ++
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  16 +
 src/gallium/drivers/cell/ppu/cell_state_shader.c |   8 +-
 src/gallium/drivers/cell/spu/spu_main.c          |  25 +-
 src/gallium/drivers/cell/spu/spu_main.h          |  15 +
 src/gallium/drivers/cell/spu/spu_tri.c           |  35 ++
 10 files changed, 678 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_gen_fp.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e989d8c2e5..cb0631baf5 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,6 +92,7 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
@@ -125,6 +126,20 @@ struct cell_command_fragment_ops
 };
 
 
+/** Max instructions for fragment programs */
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+
+/**
+ * Command to send a fragment progra to SPUs.
+ */
+struct cell_command_fragment_program
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   uint num_inst;        /**< Number of instructions */
+   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 8699f3f8ec..b28f4c5c31 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -26,6 +26,7 @@ SOURCES = \
 	cell_draw_arrays.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
+	cell_gen_fp.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
 	cell_state_shader.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 8cec9f45b2..14914b9c6f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -61,6 +61,7 @@ struct cell_fragment_shader_state
 {
    struct pipe_shader_state shader;
    struct tgsi_shader_info info;
+   struct spe_function code;
    void *data;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
new file mode 100644
index 0000000000..6ffe94eb14
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -0,0 +1,523 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU fragment program/shader code.
+ *
+ * Note that we generate SOA-style code here.  So each TGSI instruction
+ * operates on four pixels (and is translated into four SPU instructions,
+ * generally speaking).
+ *
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fp.h"
+
+
+/** Set to 1 to enable debug/disassembly printfs */
+#define DISASSEM 01
+
+
+/**
+ * Context needed during code generation.
+ */
+struct codegen
+{
+   int inputs_reg;      /**< 1st function parameter */
+   int outputs_reg;     /**< 2nd function parameter */
+   int constants_reg;   /**< 3rd function parameter */
+   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+
+   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
+
+   /** Per-instruction temps / intermediate temps */
+   int num_itemps;
+   int itemps[3];
+
+   struct spe_function *f;
+   boolean error;
+};
+
+
+/**
+ * Allocate an intermediate temporary register.
+ */
+static int
+get_itemp(struct codegen *gen)
+{
+   int t = spe_allocate_available_register(gen->f);
+   assert(gen->num_itemps < Elements(gen->itemps));
+   gen->itemps[gen->num_itemps++] = t;
+   return t;
+}
+
+/**
+ * Free all intermediate temporary registers.  To be called after each
+ * instruction has been emitted.
+ */
+static void
+free_itemps(struct codegen *gen)
+{
+   int i;
+   for (i = 0; i < gen->num_itemps; i++) {
+      spe_release_register(gen->f, gen->itemps[i]);
+   }
+   gen->num_itemps = 0;
+}
+
+
+/**
+ * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}.
+ * The register is allocated and initialized upon the first call.
+ */
+static int
+get_const_one_reg(struct codegen *gen)
+{
+   if (gen->one_reg <= 0) {
+      gen->one_reg = spe_allocate_available_register(gen->f);
+   }
+
+   /* one = {1.0, 1.0, 1.0, 1.0} */
+   spe_load_float(gen->f, gen->one_reg, 1.0f);
+#if DISASSEM
+   printf("il\tr%d, 1.0f\n", gen->one_reg);
+#endif
+
+   return gen->one_reg;
+}
+
+
+/**
+ * Return the index of the SPU temporary containing the named TGSI
+ * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
+ * just return the corresponding SPE register.  If the TGIS register
+ * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register
+ * and emit an SPE load instruction.
+ */
+static int
+get_src_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_src_register *src)
+{
+   int reg;
+
+   /* XXX need to examine src swizzle info here.
+    * That will involve changing the channel var...
+    */
+
+
+   switch (src->SrcRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[src->SrcRegister.Index][channel];
+      break;
+   case TGSI_FILE_INPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = src->SrcRegister.Index * 4 + channel;
+         reg = get_itemp(gen);
+         /* Load:  reg = memory[(machine_reg) + offset] */
+         spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+#if DISASSEM
+         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
+#endif
+      }
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      /* xxx fall-through for now / fix */
+   case TGSI_FILE_CONSTANT:
+      /* xxx fall-through for now / fix */
+   default:
+      assert(0);
+   }
+
+   return reg;
+}
+
+
+/**
+ * Return the index of an SPE register to use for the given TGSI register.
+ * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the
+ * corresponding SPE register is returned.  If the TGSI register is
+ * TGSI_FILE_OUTPUT we allocate an intermediate temporary register.
+ * See store_dest_reg() below...
+ */
+static int
+get_dst_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_dst_register *dest)
+{
+   int reg;
+
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      break;
+   case TGSI_FILE_OUTPUT:
+      reg = get_itemp(gen);
+      break;
+   default:
+      assert(0);
+   }
+
+   return reg;
+}
+
+
+/**
+ * When a TGSI instruction is writing to an output register, this
+ * function emits the SPE store instruction to store the value_reg.
+ * \param value_reg  the SPE register containing the value to store.
+ *                   This would have been returned by get_dst_reg().
+ */
+static void
+store_dest_reg(struct codegen *gen,
+               int value_reg, int channel,
+               const struct tgsi_full_dst_register *dest)
+{
+   switch (dest->DstRegister.File) {
+   case TGSI_FILE_TEMPORARY:
+      /* no-op */
+      break;
+   case TGSI_FILE_OUTPUT:
+      {
+         /* offset is measured in quadwords, not bytes */
+         int offset = dest->DstRegister.Index * 4 + channel;
+         /* Store: memory[(machine_reg) + offset] = reg */
+         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+#if DISASSEM
+         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
+#endif
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+static boolean
+emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* XXX we don't always need to actually emit a mov instruction here */
+         spe_move(gen->f, dst_reg, src_reg);
+#if DISASSEM
+         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
+#endif
+         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
+ * becomes (up to) four SPU "fa" instructions because we're doing SOA
+ * processing.
+ */
+static boolean
+emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   /* Loop over Red/Green/Blue/Alpha channels */
+   for (ch = 0; ch < 4; ch++) {
+      /* If the dest R, G, B or A writemask is enabled... */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* get indexes of the two src, one dest SPE registers */
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Emit actual SPE instruction: d = s1 + s2 */
+         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit multiply.  See emit_ADD for comments.
+ */
+static boolean
+emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s1 * s2 */
+         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit set-if-greater-than.
+ * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
+ * the result but OpenGL/TGSI needs 0.0 and 1.0 results.
+ * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND.
+ */
+static boolean
+emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 > s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+#if DISASSEM
+         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+#endif
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+#if DISASSEM
+         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+#endif
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+/**
+ * Emit END instruction.
+ * We just return from the shader function at this point.
+ *
+ * Note that there may be more code after this that would be
+ * called by TGSI_OPCODE_CALL.
+ */
+static boolean
+emit_END(struct codegen *gen)
+{
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+#if DISASSEM
+   printf("bi\trRA\n");
+#endif
+   return true;
+}
+
+
+/**
+ * Emit code for the given instruction.  Just a big switch stmt.
+ */
+static boolean
+emit_instruction(struct codegen *gen,
+                 const struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+      return emit_MOV(gen, inst);
+   case TGSI_OPCODE_MUL:
+      return emit_MUL(gen, inst);
+   case TGSI_OPCODE_ADD:
+      return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SGT:
+      return emit_SGT(gen, inst);
+   case TGSI_OPCODE_END:
+      return emit_END(gen);
+
+   /* XXX lots more cases to do... */
+
+   default:
+      return false;
+   }
+
+   return true;
+}
+
+
+
+/**
+ * Emit "code" for a TGSI declaration.
+ * We only care about TGSI TEMPORARY register declarations at this time.
+ * For each TGSI TEMPORARY we allocate four SPE registers.
+ */
+static void
+emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+{
+   int i, ch;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+#if DISASSEM
+      printf("Declare temp reg %d .. %d\n",
+             decl->DeclarationRange.First,
+             decl->DeclarationRange.Last);
+#endif
+      for (i = decl->DeclarationRange.First;
+           i <= decl->DeclarationRange.Last;
+           i++) {
+         for (ch = 0; ch < 4; ch++) {
+            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+         }
+
+         /* XXX if we run out of SPE registers, we need to spill
+          * to SPU memory.  someday...
+          */
+
+#if DISASSEM
+         printf("  SPE regs: %d %d %d %d\n",
+                gen->temp_regs[i][0],
+                gen->temp_regs[i][1],
+                gen->temp_regs[i][2],
+                gen->temp_regs[i][3]);
+#endif
+      }
+      break;
+   default:
+      ; /* ignore */
+   }
+}
+
+
+/**
+ * Translate TGSI shader code to SPE instructions.  This is done when
+ * the state tracker gives us a new shader (via pipe->create_fs_state()).
+ *
+ * \param cell    the rendering context (in)
+ * \param tokens  the TGSI shader (in)
+ * \param f       the generated function (out)
+ */
+boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f)
+{
+   struct tgsi_parse_context parse;
+   struct codegen gen;
+
+   memset(&gen, 0, sizeof(gen));
+   gen.f = f;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   gen.inputs_reg = 3;     /* pointer to inputs array */
+   gen.outputs_reg = 4;    /* pointer to outputs array */
+   gen.constants_reg = 5;  /* pointer to constants array */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, gen.inputs_reg);
+   spe_allocate_register(f, gen.outputs_reg);
+   spe_allocate_register(f, gen.constants_reg);
+
+#if DISASSEM
+   printf("Begin %s\n", __FUNCTION__);
+   tgsi_dump(tokens, 0);
+#endif
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+#if 0
+         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
+            goto fail;
+#endif
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+            gen.error = true;
+         }
+         break;
+
+      default:
+         assert(0);
+
+      }
+   }
+
+
+   if (gen.error) {
+      /* terminate the SPE code */
+      return emit_END(&gen);
+   }
+
+#if DISASSEM
+   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+   printf("End %s\n", __FUNCTION__);
+#endif
+
+   tgsi_parse_free( &parse );
+
+   return !gen.error;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
new file mode 100644
index 0000000000..99faea7046
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef CELL_GEN_FP_H
+#define CELL_GEN_FP_H
+
+
+
+extern boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f);
+
+
+#endif /* CELL_GEN_FP_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 3ebf0749ad..2da3097983 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -73,6 +73,22 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_FS)) {
+      /* Send new fragment program to SPUs */
+      struct cell_command_fragment_program *fp
+            = cell_batch_alloc(cell, sizeof(*fp));
+      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+      fp->num_inst = cell->fs->code.num_inst;
+      memcpy(&fp->code, cell->fs->code.store,
+             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+      if (0) {
+         int i;
+         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n");
+         for (i = 0; i < fp->num_inst; i++) {
+            printf(" %3d: 0x%08x\n", i, fp->code[i]);
+         }
+      }
+   }
 
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 97e44eeb1a..3a0d066da2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -34,7 +34,7 @@
 
 #include "cell_context.h"
 #include "cell_state.h"
-
+#include "cell_gen_fp.h"
 
 
 /** cast wrapper */
@@ -61,7 +61,7 @@ static void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   /*struct cell_context *cell = cell_context(pipe);*/
+   struct cell_context *cell = cell_context(pipe);
    struct cell_fragment_shader_state *cfs;
 
    cfs = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -76,6 +76,8 @@ cell_create_fs_state(struct pipe_context *pipe,
 
    tgsi_scan_shader(templ->tokens, &cfs->info);
 
+   cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code);
+
    return cfs;
 }
 
@@ -102,6 +104,8 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs);
 
+   spe_release_func(&cfs->code);
+
    FREE((void *) cfs->shader.tokens);
    FREE(cfs);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 2a7cb75f59..78260c4259 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -232,7 +232,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info */
+   /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
@@ -244,6 +244,21 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 }
 
 
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -473,6 +488,14 @@ cmd_batch(uint opcode)
             pos += sizeof(*fops) / 8;
          }
          break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index d40539da83..2c7b625840 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -75,6 +75,12 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragAlpha,
                                       vector unsigned int mask);
 
+/** Function for running fragment program */
+typedef void (*spu_fragment_program_func)(vector float *inputs,
+                                          vector float *outputs,
+                                          vector float *constants);
+
+
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -142,9 +148,18 @@ struct spu_global
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
+   /** Current fragment program machine code */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_program_func fragment_program;
+
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
+   /** Fragment program constants (XXX preliminary/used) */
+#define MAX_CONSTANTS 32
+   vector float constants[MAX_CONSTANTS];
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index f02cdd1f76..8b93878192 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -314,7 +314,42 @@ emit_quad( int x, int y, mask_t mask )
       }
       else {
          /* simple shading */
+#if 0
          eval_coeff(1, (float) x, (float) y, colors);
+
+#else
+         /* XXX new fragment program code */
+
+         if (spu.fragment_program) {
+            vector float inputs[4*4], outputs[2*4];
+
+            /* setup inputs */
+            eval_coeff(1, (float) x, (float) y, inputs);
+
+            /* Execute the current fragment program */
+            spu.fragment_program(inputs, outputs, spu.constants);
+
+            /* Copy outputs */
+            colors[0] = outputs[0*4+0];
+            colors[1] = outputs[0*4+1];
+            colors[2] = outputs[0*4+2];
+            colors[3] = outputs[0*4+3];
+
+            if (0 && spu.init.id==0 && y == 48) {
+               printf("colors[0] = %f %f %f %f\n",
+                      spu_extract(colors[0], 0),
+                      spu_extract(colors[0], 1),
+                      spu_extract(colors[0], 2),
+                      spu_extract(colors[0], 3));
+               printf("colors[1] = %f %f %f %f\n",
+                      spu_extract(colors[1], 0),
+                      spu_extract(colors[1], 1),
+                      spu_extract(colors[1], 2),
+                      spu_extract(colors[1], 3));
+            }
+
+         }
+#endif
       }
 
 
-- 
cgit v1.2.3


From aca74a4d92ba6f99d756ab703a78efc3918b3840 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:55:10 -0600
Subject: cell: make sure the fragment ops and fragment shader code buffer is
 at a 32-byte boundary

To make sure even/odd instructions hit the right pipes.
---
 src/gallium/drivers/cell/spu/spu_main.c | 4 +++-
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6b62417558..b4d30228f7 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -705,6 +705,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
 
    one_time_init();
 
@@ -721,7 +723,7 @@ main(main_param_t speid, main_param_t argp)
 
 #if 0
    if (spu.init.id==0)
-      spu_test_misc();
+      spu_test_misc(spu.init.id);
 #endif
 
    main_loop();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b625840..72e540fcff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops machine code, at 32-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program machine code, at 32-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
-- 
cgit v1.2.3


From bac5900a14b85a6513fae7eef19a5ed1d26b2011 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:58:17 -0600
Subject: cell: align instruction buffers to 8-byte, not 32-byte boundary

---
 src/gallium/drivers/cell/spu/spu_main.c | 4 ++--
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 src/gallium/include/pipe/p_compiler.h   | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6ef65d5645..8f3e3785c1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -706,8 +706,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
-   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 72e540fcff..29a305232e 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code, at 32-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code, at 32-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment program machine code, at 8-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 1e702c7fa7..7bcebd3d6b 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,12 +144,12 @@ typedef unsigned char boolean;
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
-#define ALIGN32_ATTRIB  __attribute__(( aligned( 32 ) ))
+#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
-#define ALIGN32_ATTRIB
+#define ALIGN8_ATTRIB
 #endif
 
 
-- 
cgit v1.2.3


From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 3 Oct 2008 18:00:43 -0600
Subject: CELL: changes to generate SPU code for stenciling

This set of code changes are for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        | 246 +++++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h        |  41 +-
 src/gallium/drivers/cell/common.h                  |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 881 ++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_render.c         |   1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   1 +
 src/gallium/drivers/cell/spu/spu_main.h            |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  19 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  35 +-
 src/gallium/drivers/cell/spu/spu_tri.h             |   2 +-
 12 files changed, 1091 insertions(+), 146 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
 
 
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/* This function is constructed identically to spe_sor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 10 bits or less, it fits inside a
+    * Compare Logical Greater Than Word Immediate instruction.
+    */
+   if ((ui & 0x000003ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * are allowed.  In the unlikely case that we exceed the set count,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,34 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
-   *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in 
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
+                 unsigned int mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* stencil_fail = mask & ~stencil_pass */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and 
+       * comparing directly.  Compare Logical Greater Than Word (clgt) 
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      spe_move(f, stencil_pass_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes the the stencil_max_value is of the form 
+    * 2^n-1 and can therefore be used as a mask for the valid bits in 
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a 
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits 
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa, 
+                       const int const facing_reg,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to 
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg, 
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values 
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+         modified_buffers = true;
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+         spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+         spe_release_register(f, stencil_pass_depth_pass_mask);
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit Z values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit values */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
 
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
 
       }
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+
+   /* det = cross(e,f).z */
+   return ex * fy - ey * fx;
+}
+
 
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 800c350d71132bbb5126bd89310df540332978f4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:14:27 -0600
Subject: cell: add support for fragment shader constant buffers

---
 src/gallium/drivers/cell/common.h                |  1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 10 +++++++++-
 src/gallium/drivers/cell/ppu/cell_state.h        |  5 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c   | 19 +++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_state_shader.c |  5 ++++-
 src/gallium/drivers/cell/spu/spu_command.c       | 22 ++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h          |  8 +++++---
 7 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c223bc1744..d261c1a640 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -94,6 +94,7 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 131a2356fe..3065869d04 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -215,7 +215,15 @@ get_src_reg(struct codegen *gen,
          reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
          break;
       case TGSI_FILE_CONSTANT:
-         /* xxx fall-through for now / fix */
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+         }
+         break;
       default:
          assert(0);
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index a36fd3a601..cbfa393cfb 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
@@ -162,6 +163,24 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..54a17eaf2b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d887..91a4c137e7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -231,6 +231,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 }
 
 
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words) */
+   return pos + 2 + num_const / 2;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -456,6 +475,9 @@ cmd_batch(uint opcode)
             pos += sizeof(*fp) / 8;
          }
          break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1cd577c23c..82c9c69a3a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
 #define MAX_HEIGHT 1024
 
 
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -157,9 +160,8 @@ struct spu_global
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From 3b07c28dee74c7aa3be5efac8084d610675af291 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:55:08 -0600
Subject: cell: do texture sampling/filtering for four pixels at a time.

---
 src/gallium/drivers/cell/spu/spu_command.c |  11 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c   |   4 +
 src/gallium/drivers/cell/spu/spu_main.h    |  19 ++++-
 src/gallium/drivers/cell/spu/spu_texture.c | 125 ++++++++++++++++++++++++++++-
 src/gallium/drivers/cell/spu/spu_texture.h |  12 +++
 5 files changed, 161 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 91a4c137e7..c59be7defd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,10 +301,14 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
+      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+   }
+   else {
       spu.sample_texture[sampler->unit] = sample_texture_nearest;
+      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+   }
 }
 
 
@@ -323,6 +327,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].width4 = spu_splats((float) width);
+   spu.texture[unit].height4 = spu_splats((float) height);
+
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 7dd7fcd253..13c234ea2e 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,6 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
 {
    //const uint unit = 0;
    struct vec_4x4 colors;
+#if 0
    vector float coords[4];
 
    coords[0] = s;
@@ -121,6 +122,9 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
    colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
 
    _transpose_matrix4x4(colors.v, colors.v);
+#else
+   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 82c9c69a3a..5d14be51c2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,14 @@ typedef union {
 typedef vector float (*spu_sample_texture_func)(uint unit,
                                                 vector float texcoord);
 
+typedef void (*spu_sample_texture4_func)(vector float s,
+                                         vector float t,
+                                         vector float r,
+                                         vector float q,
+                                         uint unit,
+                                         vector float colors[4]);
+
+
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
@@ -107,10 +115,12 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size; /**< == {width, height, 0, 0} */
+   vector float width4;   /**< == {width, width, width, width} */
+   vector float height4;  /**< == {height, height, height, height} */
+   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
+   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
+   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
 
@@ -159,6 +169,7 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..12e6ed1ba1 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -91,10 +93,10 @@ static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
@@ -132,6 +134,31 @@ sample_texture_nearest(uint unit, vector float texcoord)
 }
 
 
+/**
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4])
+{
+   vector float ss = spu_mul(s, spu.texture[unit].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   vector unsigned int is = spu_convtu(ss, 0);
+   vector unsigned int it = spu_convtu(tt, 0);
+   vec_uint4 texels[4];
+
+   /* GL_REPEAT wrap mode: */
+   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+
+   get_four_texels(unit, is, it, texels);
+
+   /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
+}
+
+
 vector float
 sample_texture_bilinear(uint unit, vector float texcoord)
 {
@@ -198,3 +225,93 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    return texel_sum;
 }
+
+
+void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+
+   vector unsigned int is0 = spu_convtu(ss, 0);
+   vector unsigned int it0 = spu_convtu(tt, 0);
+
+   /* is + 1, it + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* XXX possibly rework following code to compute the weighted sample
+    * colors with integer arithmetic for fewer int->float conversions.
+    */
+
+   /* convert packed int texels to float colors */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..f019e7d8ef 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -40,8 +40,20 @@ extern vector float
 sample_texture_nearest(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4]);
+
+
 extern vector float
 sample_texture_bilinear(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From c8fb3682619ea49c5fefdf8b88cdb95eac7478ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 11:16:04 -0600
Subject: cell: remove old texture code

---
 src/gallium/drivers/cell/spu/spu_command.c |  2 -
 src/gallium/drivers/cell/spu/spu_funcs.c   | 19 -------
 src/gallium/drivers/cell/spu/spu_main.h    |  4 --
 src/gallium/drivers/cell/spu/spu_texture.c | 88 ++----------------------------
 src/gallium/drivers/cell/spu/spu_texture.h |  8 ---
 src/gallium/drivers/cell/spu/spu_tri.c     | 67 +----------------------
 6 files changed, 7 insertions(+), 181 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c59be7defd..d4cc9a2146 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -302,11 +302,9 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 13c234ea2e..4c90b701ee 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -104,27 +104,8 @@ static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
-   //const uint unit = 0;
    struct vec_4x4 colors;
-#if 0
-   vector float coords[4];
-
-   coords[0] = s;
-   coords[1] = t;
-   coords[2] = r;
-   coords[3] = q;
-   _transpose_matrix4x4(coords, coords);
-
-   /* get four texture samples */
-   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
-   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
-   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
-   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
-
-   _transpose_matrix4x4(colors.v, colors.v);
-#else
    spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
-#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5d14be51c2..2a8cb00f8d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -64,9 +64,6 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
-
 typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
@@ -168,7 +165,6 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 12e6ed1ba1..ba62ad27fd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -120,21 +120,9 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
-/**
- * Get texture sample at texcoord.
- */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
-{
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
-}
-
 
 /**
+ * Do nearest texture sampling for four pixels.
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
@@ -148,7 +136,7 @@ sample_texture4_nearest(vector float s, vector float t,
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
-   /* GL_REPEAT wrap mode: */
+   /* PIPE_TEX_WRAP_REPEAT */
    is = spu_and(is, spu.texture[unit].tex_size_x_mask);
    it = spu_and(it, spu.texture[unit].tex_size_y_mask);
 
@@ -159,74 +147,10 @@ sample_texture4_nearest(vector float s, vector float t,
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
-{
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
-
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
-
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
-
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
-
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
-
-   get_four_texels(unit, x, y, texels);
-
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
-
-
-   /* Compute weighting factors in [0,1]
-    * Multiply texcoord by 1024, AND with 1023, convert back to float.
-    */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
-}
-
-
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f019e7d8ef..d576aed719 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,20 +36,12 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4]);
 
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a62d4f0f2f..022d21ba8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,72 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (0/*spu.texture[0].start*/) {
-         /*
-          * Temporary texture mapping path
-          * This will go away when fragment programs support TEX inst.
-          */
-         const uint unit = 0;
-         vector float colors[4];
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
-
-         {
-            /* Convert fragment data from AoS to SoA format.
-             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-             * This is temporary!
-             */
-            vector float soa_frag[4];
-            _transpose_matrix4x4(soa_frag, colors);
-
-            vector float fragZ = eval_z((float) x, (float) y);
-
-            /* Do all per-fragment/quad operations here, including:
-             * alpha test, z test, stencil test, blend and framebuffer writing.
-             */
-            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                             fragZ,
-                             soa_frag[0], soa_frag[1],
-                             soa_frag[2], soa_frag[3],
-                             mask,
-                             setup.facing);
-         }
-
-      }
-      else {
+      {
          /*
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-- 
cgit v1.2.3


From b0c136cfb1fcbcea35e17dc699a96acbb24738f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 15:17:01 -0600
Subject: cell: remove old texture-related fields

---
 src/gallium/drivers/cell/spu/spu_command.c | 3 ---
 src/gallium/drivers/cell/spu/spu_main.h    | 2 --
 2 files changed, 5 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d4cc9a2146..64890f6dbd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -330,9 +330,6 @@ cmd_state_texture(const struct cell_command_texture *texture)
 
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
    spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
    spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2a8cb00f8d..e3960dbe8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -112,10 +112,8 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size; /**< == {width, height, 0, 0} */
    vector float width4;   /**< == {width, width, width, width} */
    vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
    vector unsigned int tex_size_x_mask; /**< splat(width-1) */
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
-- 
cgit v1.2.3


From 978799beb2a9c51550abb1f37bb6f63d06bc4717 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 16:43:11 -0600
Subject: cell: initial work for mipmap texture filtering

---
 src/gallium/drivers/cell/common.h              |   6 +-
 src/gallium/drivers/cell/ppu/cell_screen.c     |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  18 ++--
 src/gallium/drivers/cell/ppu/cell_texture.c    |  48 ++++++----
 src/gallium/drivers/cell/ppu/cell_texture.h    |   6 +-
 src/gallium/drivers/cell/spu/spu_command.c     |  37 +++++---
 src/gallium/drivers/cell/spu/spu_funcs.c       |   1 +
 src/gallium/drivers/cell/spu/spu_main.h        |   7 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 120 ++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_texture.h     |   6 ++
 10 files changed, 176 insertions(+), 77 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 5dc756023f..e4de9a551d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
 #define CELL_MAX_SPUS 6
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
 
 #define TILE_SIZE 32
 
@@ -251,8 +252,9 @@ struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa290..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
       return 10;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cbfa393cfb..7090b4c99f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -211,14 +211,20 @@ cell_emit_state(struct cell_context *cell)
          texture->opcode = CELL_CMD_STATE_TEXTURE;
          texture->unit = i;
          if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = cell->texture[i]->tiled_data[level];
+               texture->width[level] = cell->texture[i]->base.width[level];
+               texture->height[level] = cell->texture[i]->base.height[level];
+            }
          }
          else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = NULL;
+               texture->width[level] = 1;
+               texture->height[level] = 1;
+            }
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..f5f81ac3cc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -66,6 +66,8 @@ cell_texture_layout(struct cell_texture * spt)
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -249,33 +251,41 @@ cell_tile_texture(struct cell_context *cell,
                   struct cell_texture *texture)
 {
    struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
+   uint face = 0, level, zslice = 0;
    const uint *src;
 
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
+   for (level = 0; level <= texture->base.last_level; level++) {
+      if (!texture->tiled_data[level]) {
+         struct pipe_surface *surf;
 
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
+         const uint w = texture->base.width[level], h = texture->base.height[level];
 
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+         if (w < 32 || h < 32)
+            continue;
+         /* temporary restrictions: */
+         assert(w >= TILE_SIZE);
+         assert(h >= TILE_SIZE);
+         assert(w % TILE_SIZE == 0);
+         assert(h % TILE_SIZE == 0);
 
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
-   }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
+         surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+         ASSERT(surf);
+         
+         src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
 
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+         if (texture->tiled_data[level]) {
+            align_free(texture->tiled_data[level]);
+         }
+         texture->tiled_data[level] = align_malloc(w * h * 4, 16);
 
-   pipe_surface_unmap(surf);
+         tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src);
 
-   pipe_surface_reference(&surf, NULL);
+         pipe_surface_unmap(surf);
+
+         pipe_surface_reference(&surf, NULL);
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..6d35736984 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 64890f6dbd..089af22415 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,6 +301,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
+#if 0
+   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   }
+   else
+#endif
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
@@ -314,24 +320,29 @@ static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
    const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
+   uint i;
 
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
+   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
 
-   spu.texture[unit].width4 = spu_splats((float) width);
-   spu.texture[unit].height4 = spu_splats((float) height);
+      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
 
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
 
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+
+      spu.texture[unit].level[i].width4 = spu_splats((float) width);
+      spu.texture[unit].level[i].height4 = spu_splats((float) height);
+
+      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
+      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 4c90b701ee..f2946010bd 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -100,6 +100,7 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+
 static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e3960dbe8b..9515543efe 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,7 +107,7 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+struct spu_texture_level
 {
    void *start;
    ushort width, height;
@@ -118,6 +118,11 @@ struct spu_texture
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+} ALIGN16_ATTRIB;
+
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96ef88822a..96c09e3ccb 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -27,6 +27,7 @@
 
 
 #include <transpose_matrix4x4.h>
+#include <math.h>
 
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
@@ -42,11 +43,12 @@
 void
 invalidate_tex_cache(void)
 {
+   uint lvl = 0;
    uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].level[lvl].width
+      * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
 }
 
 
@@ -64,15 +66,17 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   const unsigned texture_ea = (uintptr_t) tlevel->start;
    vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -104,17 +108,18 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   const uint lvl = 0;
+   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
 
-   get_four_texels(unit, is, it, texels);
+   get_four_texels(unit, lvl, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -130,8 +135,9 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+   const uint lvl = 0;
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -141,17 +147,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -270,10 +276,11 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
+   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -294,17 +301,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -363,3 +370,54 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+   uint lvl = 0;
+   float width = spu.texture[unit].level[lvl].width;
+   float height = spu.texture[unit].level[lvl].width;
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+   float lambda = logf(rho) * 1.442695f;
+   return lambda;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection.
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4])
+{
+   float lambda = compute_lambda(unit, s, t);
+
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   /* hack for now */
+   int level = (int) lambda;
+   if (level > 3)
+      level = 3;
+
+   /*
+   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   */
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 38a17deda2..4802f7c47c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -53,4 +53,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          uint unit, vector float colors[4]);
 
 
+extern void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From f8bddf698d523f597fea0f721b064daee81d8005 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:11:52 -0600
Subject: cell: basic mipmap filtering works now

Though, only GL_MIPMAP_NEAREST / GL_LINEAR works right now.
---
 src/gallium/drivers/cell/spu/spu_command.c |  21 ++++--
 src/gallium/drivers/cell/spu/spu_funcs.c   |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 106 +++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_texture.h |   8 +--
 5 files changed, 79 insertions(+), 61 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 089af22415..4e98eea338 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,16 +301,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-#if 0
+
    if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* use lambda/lod to determine min vs. mag filter */
       spu.sample_texture4[sampler->unit] = sample_texture4_lod;
    }
-   else
-#endif
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+   else if (spu.sampler[sampler->unit].min_img_filter
+            == PIPE_TEX_FILTER_LINEAR) {
+      /* min = mag = bilinear */
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
+      /* min = mag = inearest */
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
@@ -322,8 +324,12 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
+   //if (spu.init.id==0) Debug=1;
+
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
+   spu.texture[unit].max_level = 0;
+
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
@@ -335,14 +341,19 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
 
-      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].width4 = spu_splats((float) width);
       spu.texture[unit].level[i].height4 = spu_splats((float) height);
 
       spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
       spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
    }
+   //Debug=0;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index f2946010bd..66b82f673d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 9515543efe..cfb645add0 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit,
+                                         uint unit, uint level,
                                          vector float colors[4]);
 
 
@@ -121,6 +121,7 @@ struct spu_texture_level
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96c09e3ccb..10036330c6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 
-#include <transpose_matrix4x4.h>
 #include <math.h>
 
 #include "pipe/p_compiler.h"
@@ -43,12 +42,14 @@
 void
 invalidate_tex_cache(void)
 {
-   uint lvl = 0;
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].level[lvl].width
-      * spu.texture[unit].level[lvl].height;
+   uint lvl;
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;
+      uint bytes = 4 * spu.texture[unit].level[lvl].width
+         * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -71,8 +72,8 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -106,20 +107,19 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4])
+                        uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
+   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
 
-   get_four_texels(unit, lvl, is, it, texels);
+   get_four_texels(unit, level, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -133,11 +133,10 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                         uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -147,17 +146,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -273,14 +272,13 @@ transpose(vector unsigned int *mOut0,
  */
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                           vector float r, vector float q,
+                           uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -301,17 +299,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -379,9 +377,9 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 static float
 compute_lambda(uint unit, vector float s, vector float t)
 {
-   uint lvl = 0;
-   float width = spu.texture[unit].level[lvl].width;
-   float height = spu.texture[unit].level[lvl].width;
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].width;
    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
@@ -402,22 +400,30 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4])
+                    uint unit, uint level, vector float colors[4])
 {
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
    float lambda = compute_lambda(unit, s, t);
 
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
    if (lambda < spu.sampler[unit].min_lod)
       lambda = spu.sampler[unit].min_lod;
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* hack for now */
-   int level = (int) lambda;
-   if (level > 3)
-      level = 3;
+   /* convert to int level */
+   level = (int) (lambda + 0.5f);
+   ASSERT(level >= 0);
+
+   if (level > spu.texture[unit].max_level)
+      level = spu.texture[unit].max_level;
 
-   /*
    sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
-   */
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 4802f7c47c..ec06a50b4a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,24 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4]);
+                        uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                         uint unit, uint level, vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                           uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4]);
+                    uint unit, uint level, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 6d2d5ceca21c87bea5e269e8099fb6f1d821b97a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:42:21 -0600
Subject: cell: use minify vs magnify filters

---
 src/gallium/drivers/cell/spu/spu_command.c | 50 +++++++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_main.h    |  2 ++
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++------
 3 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4e98eea338..fa78377c66 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -298,22 +298,48 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+   uint unit = sampler->unit;
 
-   spu.sampler[sampler->unit] = sampler->state;
+   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
 
-   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
-      /* use lambda/lod to determine min vs. mag filter */
-      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   spu.sampler[unit] = sampler->state;
+
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else if (spu.sampler[sampler->unit].min_img_filter
-            == PIPE_TEX_FILTER_LINEAR) {
-      /* min = mag = bilinear */
-      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+
+   switch (spu.sampler[sampler->unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else {
-      /* min = mag = inearest */
-      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+
+   switch (spu.sampler[sampler->unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture4[unit] = sample_texture4_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index cfb645add0..56aac655e9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -170,6 +170,8 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 10036330c6..267f2302f6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -400,7 +400,7 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4])
+                    uint unit, uint level_ignored, vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -417,13 +417,17 @@ sample_texture4_lod(vector float s, vector float t,
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* convert to int level */
-   level = (int) (lambda + 0.5f);
-   ASSERT(level >= 0);
-
-   if (level > spu.texture[unit].max_level)
-      level = spu.texture[unit].max_level;
-
-   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   if (lambda <= 0.0f) {
+      /* magnify */
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+   }
+   else {
+      /* minify */
+      int level = (int) (lambda + 0.5f);
+      if (level > (int) spu.texture[unit].max_level)
+         level = spu.texture[unit].max_level;
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      /* XXX to do: mipmap level interpolation */
+   }
 }
 
-- 
cgit v1.2.3


From 85dc1aec9c5fc63a01bb8db07215b84790d15d8f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 15:19:01 -0600
Subject: cell: support NPOT textures, clamp/repeat mode, normalized/unorm
 texcoords

glDrawPixels works now.
---
 src/gallium/drivers/cell/spu/spu_command.c | 48 +++++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h    | 12 ++--
 src/gallium/drivers/cell/spu/spu_texture.c | 99 ++++++++++++++++++++----------
 3 files changed, 117 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index fa78377c66..b1efe97e76 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -295,6 +295,42 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -341,6 +377,8 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    default:
       ASSERT(0);
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -370,15 +408,15 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
-      spu.texture[unit].level[i].width4 = spu_splats((float) width);
-      spu.texture[unit].level[i].height4 = spu_splats((float) height);
-
-      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
-      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
       if (texture->start[i])
          spu.texture[unit].max_level = i;
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+
    //Debug=0;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56aac655e9..45c6f4ced1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,17 +107,21 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
+/** per-texture level info */
 struct spu_texture_level
 {
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float width4;   /**< == {width, width, width, width} */
-   vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
-   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
+   /** texcoord scale factors */
+   vector float scale_s, scale_t;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t;
+   /** texcoord clamp limits */
+   vector signed int max_s, max_t;
 } ALIGN16_ATTRIB;
 
+
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 83cf7dc394..b21c43a467 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -67,13 +67,13 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -99,6 +99,20 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 }
 
 
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);
+   return vec;
+}
+
+
 
 /**
  * Do nearest texture sampling for four pixels.
@@ -109,15 +123,20 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
-   vector unsigned int is = spu_convtu(ss, 0);
-   vector unsigned int it = spu_convtu(tt, 0);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
 
    get_four_texels(unit, level, is, it, texels);
 
@@ -135,21 +154,28 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
 
-   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
-   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
@@ -275,34 +301,41 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
                            uint unit, uint level, vector float colors[4])
 {
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
-   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
+   vector signed int is = spu_convts(ss, 8);
+   vector signed int it = spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights0 = spu_and(is, 255);
+   vector signed int tWeights0 = spu_and(it, 255);
    vector signed int sWeights1 = spu_sub(255, sWeights0);
    vector signed int tWeights1 = spu_sub(255, tWeights0);
 
    /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector unsigned int is0 = spu_rlmask(is, -8);
-   vector unsigned int it0 = spu_rlmask(it, -8);
+   vector signed int is0 = spu_rlmask(is, -8);
+   vector signed int it0 = spu_rlmask(it, -8);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-- 
cgit v1.2.3


From 8f7c6b55ae962e30f32cfec9a14a652d3b5b5943 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:11:29 -0600
Subject: cell: support for cubemaps

Though, progs/demos/cubemap.c doesn't quite work right...
---
 src/gallium/drivers/cell/common.h              |   1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +
 src/gallium/drivers/cell/ppu/cell_texture.c    |  37 ++++--
 src/gallium/drivers/cell/spu/spu_command.c     |  17 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c       |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h        |   4 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 171 ++++++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_texture.h     |  21 ++-
 8 files changed, 214 insertions(+), 41 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e4de9a551d..c1e78f4db3 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -251,6 +251,7 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cae546b700..d4a867ffcf 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -217,6 +217,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
             }
+            texture->target = cell->texture[i]->base.target;
          }
          else {
             uint level;
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = 0;
                texture->height[level] = 0;
             }
+            texture->target = 0;
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4fd66bdea0..4c92ef154f 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -137,6 +137,7 @@ cell_texture_release(struct pipe_screen *screen,
    */
    if (--(*pt)->refcount <= 0) {
       struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
       DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
@@ -144,6 +145,12 @@ cell_texture_release(struct pipe_screen *screen,
 
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         if (ct->tiled_data[i]) {
+            FREE(ct->tiled_data[i]);
+         }
+      }
+
       FREE(ct);
    }
    *pt = NULL;
@@ -204,27 +211,33 @@ static void
 cell_twiddle_texture(struct pipe_screen *screen,
                      struct pipe_surface *surface)
 {
-   struct cell_texture *texture = cell_texture(surface->texture);
+   struct cell_texture *ct = cell_texture(surface->texture);
    const uint level = surface->level;
-   const uint texWidth = texture->base.width[level];
-   const uint texHeight = texture->base.height[level];
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
    const uint bufWidth = align(texWidth, TILE_SIZE);
    const uint bufHeight = align(texHeight, TILE_SIZE);
    const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
-   switch (texture->base.format) {
+   switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      /* free old tiled data */
-      if (texture->tiled_data[level]) {
-         align_free(texture->tiled_data[level]);
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_data[level]) {
+            ct->tiled_data[level] =
+               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
       }
-      /* alloc new tiled data */
-      texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
-      twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
-                         texture->tiled_data[level],
-                         surface->stride, src);
       break;
    default:
       printf("Cell: twiddle unsupported texture format\n");
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b1efe97e76..c951fa6f31 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,7 +301,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler)
+                 const struct pipe_sampler_state *sampler,
+                 uint unit)
 {
    uint i;
 
@@ -328,6 +329,11 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
+
+   /* XXX temporary hack */
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      spu.sample_texture4[unit] = sample_texture4_cube;
+   }
 }
 
 
@@ -378,7 +384,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -393,6 +399,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
 
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
@@ -408,6 +415,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
+         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
@@ -415,7 +426,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 
    //Debug=0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 66b82f673d..5c3ee305d4 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 45c6f4ced1..8781041bff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit, uint level,
+                                         uint unit, uint level, uint face,
                                          vector float colors[4]);
 
 
@@ -113,6 +113,7 @@ struct spu_texture_level
    void *start;
    ushort width, height;
    ushort tiles_per_row;
+   uint bytes_per_image;
    /** texcoord scale factors */
    vector float scale_s, scale_t;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
@@ -126,6 +127,7 @@ struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b21c43a467..2570f02c73 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,6 +48,9 @@ invalidate_tex_cache(void)
       uint bytes = 4 * spu.texture[unit].level[lvl].width
          * spu.texture[unit].level[lvl].height;
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
 }
@@ -67,11 +70,11 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   const unsigned texture_ea = (uintptr_t) tlevel->start;
+   unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
@@ -88,6 +91,8 @@ get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -121,7 +126,8 @@ spu_clamp(vector signed int vec, vector signed int max)
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4])
+                        uint unit, uint level, uint face,
+                        vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -138,7 +144,7 @@ sample_texture4_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, is, it, texels);
+   get_four_texels(unit, level, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -152,11 +158,14 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4])
+                         uint unit, uint level, uint face,
+                         vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    vector signed int is0 = spu_convts(ss, 0);
    vector signed int it0 = spu_convts(tt, 0);
@@ -179,10 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -299,10 +308,12 @@ transpose(vector unsigned int *mOut0,
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4])
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
    /* Scale texcoords by size of texture, and add half pixel bias */
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
@@ -339,10 +350,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -433,7 +444,8 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level_ignored, vector float colors[4])
+                    uint unit, uint level_ignored, uint face,
+                    vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -452,15 +464,136 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
 
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx > ary && arx > arz) {
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary > arx && ary > arz) {
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+
+void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, int face_ignored,
+                     vector float colors[4])
+{
+   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+   uint p, faces[4];
+   float newS[4], newT[4];
+
+   /* Compute cube face referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces */
+      for (p = 0; p < 4; p++) {      
+         vector float c[4];
+
+         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                 zero, zero, unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index ec06a50b4a..08b891a4a8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,35 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4]);
+                        uint unit, uint level, uint face,
+                        vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4]);
+                         uint unit, uint level, uint face,
+                         vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4]);
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4]);
+                    uint unit, uint level, uint face,
+                    vector float colors[4]);
+
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level_ignored, int face_ignored,
+                     vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 41ccdde767e7aba6e8e6a9a035eacd6338c03a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:22:40 -0600
Subject: cell: initial bits for 3D texture support

---
 src/gallium/drivers/cell/common.h              |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  2 ++
 src/gallium/drivers/cell/spu/spu_command.c     | 13 +++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h        |  8 ++++----
 src/gallium/drivers/cell/spu/spu_texture.c     |  2 ++
 5 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c1e78f4db3..b0169b8e32 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -256,6 +256,7 @@ struct cell_command_texture
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
    ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d4a867ffcf..bb694aa107 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -216,6 +216,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = cell->texture[i]->tiled_data[level];
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
+               texture->depth[level] = cell->texture[i]->base.depth[level];
             }
             texture->target = cell->texture[i]->base.target;
          }
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = NULL;
                texture->width[level] = 0;
                texture->height[level] = 0;
+               texture->depth[level] = 0;
             }
             texture->target = 0;
          }
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c951fa6f31..c28677ebf8 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
 
 
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -404,6 +412,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
+      uint depth = texture->depth[i];
 
       DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
@@ -411,13 +420,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].start = texture->start[i];
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
 
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].bytes_per_image =
-         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
-         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
 
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8781041bff..eff43b870c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -111,15 +111,15 @@ struct spu_framebuffer
 struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
    uint bytes_per_image;
    /** texcoord scale factors */
-   vector float scale_s, scale_t;
+   vector float scale_s, scale_t, scale_r;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
-   vector signed int mask_s, mask_t;
+   vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
-   vector signed int max_s, max_t;
+   vector signed int max_s, max_t, max_r;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 9e25094d13..42eb06a362 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,6 +50,8 @@ invalidate_tex_cache(void)
 
       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
          bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
-- 
cgit v1.2.3


From 53951531ae7bfd64afae1ae55aac7f6ebd3fe4f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 12:35:51 -0600
Subject: cell: propogate blend color to SPUs for the fallback fragment ops
 code

---
 src/gallium/drivers/cell/common.h                  |  4 ++
 src/gallium/drivers/cell/ppu/cell_context.h        |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  1 +
 src/gallium/drivers/cell/spu/spu_command.c         |  1 +
 src/gallium/drivers/cell/spu/spu_main.h            |  1 +
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 75 +++++++++++++++++++---
 6 files changed, 74 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index b0169b8e32..3b5a25e165 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -118,12 +118,16 @@
 
 /**
  * Command to specify per-fragment operations state and generated code.
+ * Note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case).
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 80a9b3d7e1..1fcf03c2b8 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -74,6 +74,7 @@ struct cell_fragment_shader_state
 struct cell_fragment_ops_key
 {
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_depth_stencil_alpha_state dsa;
    enum pipe_format color_format;
    enum pipe_format zs_format;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index bb694aa107..d2427584ba 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -52,6 +52,7 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    memset(&key, 0, sizeof(key));
    key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
    key.dsa = *cell->depth_stencil;
 
    if (cell->framebuffer.cbufs[0])
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a07b312111..b521c3aecf 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -195,6 +195,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
    /* Parity twist!  For now, always use the fallback code by default,
     * only switching to codegen when specifically requested.  This
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index eff43b870c..ca72baea8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -145,6 +145,7 @@ struct spu_global
    struct spu_framebuffer fb;
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 9404704abf..f8ffc70492 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -260,7 +260,7 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
 
       /*
-       * Compute Src RGB terms
+       * Compute Src RGB terms (fragment color * factor)
        */
       switch (spu.blend.rgb_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -283,13 +283,33 @@ spu_fallback_fragment_ops(uint x, uint y,
          term1g = spu_mul(fragG, fragA);
          term1b = spu_mul(fragB, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[1]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Src Alpha term
+       * Compute Src Alpha term (fragment alpha * factor)
        */
       switch (spu.blend.alpha_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -301,13 +321,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLENDFACTOR_SRC_ALPHA:
          term1a = spu_mul(fragA, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest RGB terms
+       * Compute Dest RGB terms (framebuffer color * factor)
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -337,17 +367,37 @@ spu_fallback_fragment_ops(uint x, uint y,
          term2g = spu_mul(fbRGBA[1], tmp);
          term2b = spu_mul(fbRGBA[2], tmp);
          break;
-      /* XXX more cases */
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest Alpha term
+       * Compute Dest Alpha term (framebuffer alpha * factor)
        */
       switch (spu.blend.alpha_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2a = fragA;
+         term2a = fbRGBA[3];
          break;
       case PIPE_BLENDFACTOR_SRC_COLOR:
          term2a = spu_splats(0.0f);
@@ -360,6 +410,16 @@ spu_fallback_fragment_ops(uint x, uint y,
          tmp = spu_sub(one, fragA);
          term2a = spu_mul(fbRGBA[3], tmp);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
@@ -394,9 +454,7 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_max(term1g, term2g);
          fragB = spu_max(term1b, term2b);
          break;
-      /* XXX more cases */
       default:
-         printf("unsup 0x%x\n", spu.blend.rgb_func);
          ASSERT(0);
       }
 
@@ -419,7 +477,6 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_MAX:
          fragA = spu_max(term1a, term2a);
          break;
-      /* XXX more cases */
       default:
          ASSERT(0);
       }
-- 
cgit v1.2.3


From ddeec1ed10d6c12403fe8d30c072ea68f044db99 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:55:18 -0600
Subject: cell: simplify spu debug code

---
 src/gallium/drivers/cell/common.h           |  1 +
 src/gallium/drivers/cell/ppu/cell_context.c |  1 +
 src/gallium/drivers/cell/spu/spu_command.c  | 47 +++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_debug.h    |  9 ------
 src/gallium/drivers/cell/spu/spu_main.c     |  9 +-----
 src/gallium/drivers/cell/spu/spu_main.h     | 15 +++++++--
 src/gallium/drivers/cell/spu/spu_render.c   |  7 +++--
 7 files changed, 41 insertions(+), 48 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 3b5a25e165..8ae78265f2 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -111,6 +111,7 @@
 #define CELL_DEBUG_SYNC                 (1 << 2)
 #define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
 #define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b66aa9c9d9..f8d5eef3ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -93,6 +93,7 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
    {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b521c3aecf..ebbed3d1dc 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -44,7 +44,6 @@
 #include "spu_tile.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -97,7 +96,7 @@ release_buffer(uint buffer)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -165,14 +164,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   DEBUG_PRINTF("CLEAR SURF done\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -189,7 +188,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
    static int warned = 0;
 
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
@@ -229,7 +228,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -247,11 +246,11 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
    const float *constants = (const float *) &buffer[pos + 2];
    uint i;
 
-   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
 
    /* Expand each float to float[4] for SOA execution */
    for (i = 0; i < num_const; i++) {
-      DEBUG_PRINTF("  const[%u] = %f\n", i, constants[i]);
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]);
       spu.constants[i] = spu_splats(constants[i]);
    }
 
@@ -263,7 +262,7 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -352,7 +351,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
    uint unit = sampler->unit;
 
-   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
 
    spu.sampler[unit] = sampler->state;
 
@@ -404,9 +403,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
-   //if (spu.init.id==0) Debug=1;
-
-   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
    spu.texture[unit].target = texture->target;
@@ -416,7 +413,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
       uint height = texture->height[i];
       uint depth = texture->depth[i];
 
-      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
 
       spu.texture[unit].level[i].start = texture->start[i];
@@ -438,15 +435,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
    }
 
    update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
-
-   //Debug=0;
 }
 
 
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -485,7 +480,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   DEBUG_PRINTF("FINISH\n");
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -510,7 +505,7 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
              buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
@@ -530,7 +525,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -663,7 +658,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   DEBUG_PRINTF("BATCH complete\n");
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
 }
 
 
@@ -677,7 +672,7 @@ command_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   DEBUG_PRINTF("Enter command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -686,12 +681,12 @@ command_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      DEBUG_PRINTF("Wait for cmd...\n");
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -708,7 +703,7 @@ command_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -725,7 +720,7 @@ command_loop(void)
 
    }
 
-   DEBUG_PRINTF("Exit command loop\n");
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
 
    spu_dcache_report();
 }
diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index eeec052655..25653dcdcd 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -30,28 +30,19 @@
 #define SPU_DEBUG_H
 
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
 #if DEBUG
-extern boolean Debug;
-extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
  * are supplied, but swallows the comma if there are no variadic
  * arguments (which avoids syntax errors that would otherwise occur).
  */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 #define D_PRINTF(flag, format,...) \
    if (spu.init.debug_flags & (flag)) \
       printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
 
 #else
 
-#define DEBUG_PRINTF(...)
 #define D_PRINTF(...)
 
 #endif
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 4becd0f92a..c8bb251905 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -40,7 +40,6 @@
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 //#include "spu_test.h"
-#include "spu_debug.h"
 #include "cell/common.h"
 
 
@@ -53,12 +52,6 @@ helpful headers:
 struct spu_global spu;
 
 
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-#endif
-
-
 static void
 one_time_init(void)
 {
@@ -102,7 +95,7 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    /* get initialization data */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index ca72baea8b..569b9e45d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,6 +36,19 @@
 #include "pipe/p_state.h"
 
 
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
+
 
 #define MAX_WIDTH 1024
 #define MAX_HEIGHT 1024
@@ -187,8 +200,6 @@ struct spu_global
 
 
 extern struct spu_global spu;
-extern boolean Debug;
-
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 82dbeb26b7..cfff19b6c0 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -177,7 +177,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    uint i, j;
 
 
-   if (Debug) {
+#if 0
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
              "inline_vert=%u\n",
              spu.init.id,
@@ -190,7 +190,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-   }
+#endif
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -293,7 +293,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   if (Debug)
+#if 0
       printf("SPU %u: RENDER done\n",
              spu.init.id);
+#endif
 }
-- 
cgit v1.2.3


From 79e96b3a77f7d5c7136b380abcc675c7242d0ffe Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 13:58:58 -0600
Subject: cell: move some CELL_MAX constants

---
 src/gallium/drivers/cell/common.h       |  6 +++++-
 src/gallium/drivers/cell/spu/spu_main.h | 11 ++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8ae78265f2..d716a26175 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -68,6 +68,9 @@
 
 #define CELL_MAX_SAMPLERS 4
 #define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024   /**< max framebuffer width */
 
 #define TILE_SIZE 32
 
@@ -99,13 +102,14 @@
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
-
+/** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
 #define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
+/** Debug flags */
 #define CELL_DEBUG_CHECKER              (1 << 0)
 #define CELL_DEBUG_ASM                  (1 << 1)
 #define CELL_DEBUG_SYNC                 (1 << 2)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 569b9e45d4..f87495b72d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -50,13 +50,6 @@
 #endif
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
-
-
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -175,8 +168,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-- 
cgit v1.2.3


From ec7d6c656178babdf143faa242f7a3df9d0bc22c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 15 Oct 2008 14:39:16 -0600
Subject: cell: send rasterizer state to SPUs in proper way, remove
 front_winding hack

---
 src/gallium/drivers/cell/common.h              | 18 ++++++++++++++----
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  7 +++++++
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |  1 -
 src/gallium/drivers/cell/spu/spu_command.c     |  8 ++++++++
 src/gallium/drivers/cell/spu/spu_main.h        |  1 +
 src/gallium/drivers/cell/spu/spu_render.c      |  2 +-
 src/gallium/drivers/cell/spu/spu_tri.c         |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.h         |  2 +-
 8 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 1f6f2d494b..0ff2c491fb 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -99,8 +99,9 @@
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
 #define CELL_CMD_STATE_FS_CONSTANTS  21
-#define CELL_CMD_VS_EXECUTE          22
-#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -156,13 +157,23 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
 };
 
 
+/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;
+};
+
+
 /**
  * Clear framebuffer to the given value/color.
  */
@@ -238,7 +249,6 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
-   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d2427584ba..e6387382f2 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -147,6 +147,13 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc(cell, sizeof(*rast));
+      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
       struct cell_command_fragment_program *fp
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 578ddf62dc..aa63435b93 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,7 +214,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
-      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4febd5385b..d2c282a022 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -583,6 +583,14 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_FS_CONSTANTS:
          pos = cmd_state_fs_constants(buffer, pos);
          break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 8;
+         }
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index f87495b72d..4099e52699 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -153,6 +153,7 @@ struct spu_global
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
 
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index cfff19b6c0..75a7f75abc 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2417db8960..1519b8cd7e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -775,7 +775,7 @@ determinant(const float *v0, const float *v1, const float *v2)
  */
 boolean
 tri_draw(const float *v0, const float *v1, const float *v2,
-         uint tx, uint ty, uint front_winding)
+         uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -790,7 +790,7 @@ tri_draw(const float *v0, const float *v1, const float *v2,
     * which will be needed for front/back-face stencil application
     */
    float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index abc3d35160..aa694dd7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 926b8dbb3e86360e5968882df94785ae84d0ad43 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 09:00:05 -0600
Subject: cell: clean up various texture-related things

Distinguish among texture targets in codegen.
progs/demos/cubemap.c runs correctly now too.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 29 ++++++++++++++---
 src/gallium/drivers/cell/spu/spu_command.c | 24 ++++++--------
 src/gallium/drivers/cell/spu/spu_funcs.c   | 34 +++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_main.h    | 16 +++++-----
 src/gallium/drivers/cell/spu/spu_texture.c | 50 ++++++++++++++----------------
 src/gallium/drivers/cell/spu/spu_texture.h | 34 +++++++++-----------
 6 files changed, 107 insertions(+), 80 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3dfd5f673d..2b34cf1e23 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1337,16 +1337,33 @@ emit_function_call(struct codegen *gen,
 
 
 static boolean
-emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint target = inst->InstructionExtTexture.Texture;
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   uint addr;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
    assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
 
-   spe_comment(gen->f, -4, "CALL txp:");
+   spe_comment(gen->f, -4, "CALL tex:");
 
    /* get src/dst reg info */
    for (ch = 0; ch < 4; ch++) {
@@ -1368,7 +1385,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
       }
 
-      /* setup function arguments */
+      /* setup function arguments (XXX depends on target) */
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
@@ -1674,8 +1691,10 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXB:
       /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
    case TGSI_OPCODE_TXP:
-      return emit_TXP(gen, inst);
+      return emit_TEX(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 57d265fef7..ff4a52d79a 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -310,8 +310,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler,
-                 uint unit)
+                 const struct pipe_sampler_state *sampler)
 {
    uint i;
 
@@ -338,11 +337,6 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
-
-   /* XXX temporary hack */
-   if (texture->target == PIPE_TEXTURE_CUBE) {
-      spu.sample_texture4[unit] = sample_texture4_cube;
-   }
 }
 
 
@@ -357,12 +351,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[unit].min_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -370,12 +364,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    switch (spu.sampler[sampler->unit].mag_img_filter) {
    case PIPE_TEX_FILTER_LINEAR:
-      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
    case PIPE_TEX_FILTER_ANISO:
       /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
-      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
    default:
       ASSERT(0);
@@ -384,16 +378,16 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    switch (spu.sampler[sampler->unit].min_mip_filter) {
    case PIPE_TEX_MIPFILTER_NEAREST:
    case PIPE_TEX_MIPFILTER_LINEAR:
-      spu.sample_texture4[unit] = sample_texture4_lod;
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
       break;
    case PIPE_TEX_MIPFILTER_NONE:
-      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
       break;
    default:
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -434,7 +428,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 5c3ee305d4..3534b35000 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -43,6 +43,7 @@
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
+#include "spu_texture.h"
 
 
 /** For "return"-ing four vectors */
@@ -102,11 +103,34 @@ spu_log2(vector float x)
 
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q,
-        unsigned unit)
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) q;
+   sample_texture_cube(s, t, r, unit, colors.v);
    return colors;
 }
 
@@ -147,7 +171,9 @@ return_function_info(void)
    export_func(&funcs, "spu_pow", &spu_pow);
    export_func(&funcs, "spu_exp2", &spu_exp2);
    export_func(&funcs, "spu_log2", &spu_log2);
-   export_func(&funcs, "spu_txp", &spu_txp);
+   export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+   export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+   export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 4099e52699..80e9c696f8 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -70,12 +70,10 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef void (*spu_sample_texture4_func)(vector float s,
-                                         vector float t,
-                                         vector float r,
-                                         vector float q,
-                                         uint unit, uint level, uint face,
-                                         vector float colors[4]);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
 
 
 /** Function for performing per-fragment ops */
@@ -183,9 +181,9 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
-   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 42eb06a362..04202a7657 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -126,10 +126,9 @@ spu_clamp(vector signed int vec, vector signed int max)
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4])
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -158,10 +157,9 @@ sample_texture4_nearest(vector float s, vector float t,
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4])
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -308,10 +306,9 @@ transpose(vector unsigned int *mOut0,
  * Bilinear filtering, using int intead of float arithmetic
  */
 void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
-                           uint unit, uint level, uint face,
-                           vector float colors[4])
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
@@ -444,10 +441,9 @@ compute_lambda(uint unit, vector float s, vector float t)
  * Texture sampling with level of detail selection.
  */
 void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level_ignored, uint face,
-                    vector float colors[4])
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -455,6 +451,9 @@ sample_texture4_lod(vector float s, vector float t,
     */
    float lambda = compute_lambda(unit, s, t);
 
+   (void) face;
+   (void) level_ignored;
+
    /* apply lod bias */
    lambda += spu.sampler[unit].lod_bias;
 
@@ -466,14 +465,14 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
+      spu.min_sample_texture_2d[unit](s, t, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
@@ -552,13 +551,10 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 
 
 void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level, uint face_ignored,
-                     vector float colors[4])
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
 {
-   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
-   uint p, faces[4];
+   uint p, faces[4], level = 0;
    float newS[4], newT[4];
 
    /* Compute cube face referenced by the four sets of texcoords.
@@ -577,15 +573,15 @@ sample_texture4_cube(vector float s, vector float t,
       /* GOOD!  All four texcoords refer to the same cube face */
       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
-      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+      sample_texture_2d_nearest(s, t, unit, level, faces[0], colors);
    }
    else {
       /* BAD!  The four texcoords refer to different faces */
       for (p = 0; p < 4; p++) {      
          vector float c[4];
 
-         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
-                                 zero, zero, unit, level, faces[p], c);
+         sample_texture_2d_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                   unit, level, faces[p], c);
 
          float red = spu_extract(c[0], p);
          float green = spu_extract(c[1], p);
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 387484c3ad..7b75b007b5 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -37,37 +37,31 @@ invalidate_tex_cache(void);
 
 
 extern void
-sample_texture4_nearest(vector float s, vector float t,
-                        vector float r, vector float q,
-                        uint unit, uint level, uint face,
-                        vector float colors[4]);
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
 
 
 extern void
-sample_texture4_bilinear(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, uint level, uint face,
-                         vector float colors[4]);
-
-extern void
-sample_texture4_bilinear_2(vector float s, vector float t,
-                           vector float r, vector float q,
+sample_texture_2d_bilinear(vector float s, vector float t,
                            uint unit, uint level, uint face,
                            vector float colors[4]);
 
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
 
 extern void
-sample_texture4_lod(vector float s, vector float t,
-                    vector float r, vector float q,
-                    uint unit, uint level, uint face,
-                    vector float colors[4]);
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
 
 
 extern void
-sample_texture4_cube(vector float s, vector float t,
-                     vector float r, vector float q,
-                     uint unit, uint level_ignored, uint face_ignored,
-                     vector float colors[4]);
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 033c90f4c16c1da517d676282508208319bd5ec5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 16 Oct 2008 13:49:42 -0600
Subject: cell: implement KIL instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 80 ++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h    |  6 +--
 src/gallium/drivers/cell/spu/spu_tri.c     |  5 +-
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 2b34cf1e23..493ee1a0c9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,9 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   /** KIL mask: indicates which fragments have been killed */
+   int kill_mask_reg;
+
    int frame_size;  /**< Stack frame size, in words */
 
    struct spe_function *f;
@@ -431,8 +434,21 @@ emit_prologue(struct codegen *gen)
 static void
 emit_epilogue(struct codegen *gen)
 {
+   const int return_reg = 3;
+
    spe_comment(gen->f, -4, "Function epilogue:");
 
+   spe_comment(gen->f, 0, "return the killed mask");
+   if (gen->kill_mask_reg > 0) {
+      /* shader called KIL, return the "alive" mask */
+      spe_move(gen->f, return_reg, gen->kill_mask_reg);
+   }
+   else {
+      /* return {0,0,0,0} */
+      spe_load_uint(gen->f, return_reg, 0);
+   }
+
+   spe_comment(gen->f, 0, "restore stack and return");
    if (gen->frame_size >= 512) {
       /* offset is too large for ai instruction */
       int offset_reg = spe_allocate_available_register(gen->f);
@@ -1423,6 +1439,68 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * KILL if any of src reg values are less than zero.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+   spe_comment(gen->f, -4, "CALL kil:");
+
+   /* zero = {0,0,0,0} */
+   zero_reg = get_itemp(gen);
+   spe_load_uint(gen->f, zero_reg, 0);
+
+   cmp_reg = get_itemp(gen);
+
+   /* get src regs */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      }
+   }
+
+   /* test if any src regs are < 0 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (kil_reg >= 0) {
+            /* cmp = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+            /* kil = kil | cmp */
+            spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+         }
+         else {
+            kil_reg = get_itemp(gen);
+            /* kil = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+         }
+      }
+   }
+
+   if (gen->if_nesting) {
+      /* may have been a conditional kil */
+      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+   }
+
+   /* allocate the kill mask reg if needed */
+   if (gen->kill_mask_reg <= 0) {
+      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+      spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+   }
+   else {
+      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1695,6 +1773,8 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXP:
       return emit_TEX(gen, inst);
+   case TGSI_OPCODE_KIL:
+      return emit_KIL(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 80e9c696f8..95ef4c9244 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -89,9 +89,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       uint facing);
 
 /** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
-                                          vector float *outputs,
-                                          vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+                                                         vector float *outputs,
+                                                         vector float *constants);
 
 
 struct spu_framebuffer
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index d83085d0f9..4caf7d6b61 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -254,6 +254,7 @@ emit_quad( int x, int y, mask_t mask)
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
          vector float fragW = eval_w((float) x, (float) y);
+         vector unsigned int kill_mask;
 
          /* setup inputs */
 #if 0
@@ -268,7 +269,9 @@ emit_quad( int x, int y, mask_t mask)
          ASSERT(spu.fragment_ops);
 
          /* Execute the current fragment program */
-         spu.fragment_program(inputs, outputs, spu.constants);
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         mask = spu_andc(mask, kill_mask);
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
-- 
cgit v1.2.3


From 70dd4379d2cd54f229c3940312537912470218d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:34:13 -0600
Subject: cell: implement fencing for texture buffers

If we delete a texture, we need to keep the underlying tiled data buffer
around until any rendering that references it has completed.
Keep a list of buffers referenced by a rendering batch.  Unref/free them when
the associated batch's fence is executed/signalled.
---
 src/gallium/drivers/cell/common.h              |  25 ++++
 src/gallium/drivers/cell/ppu/Makefile          |   1 +
 src/gallium/drivers/cell/ppu/cell_batch.c      |  32 +++++
 src/gallium/drivers/cell/ppu/cell_context.c    |   6 +
 src/gallium/drivers/cell/ppu/cell_context.h    |  21 ++++
 src/gallium/drivers/cell/ppu/cell_fence.c      | 158 +++++++++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_fence.h      |  57 +++++++++
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    |  33 ++++--
 src/gallium/drivers/cell/ppu/cell_texture.h    |   5 +-
 src/gallium/drivers/cell/ppu/cell_vbuf.c       |   6 +
 src/gallium/drivers/cell/spu/spu_command.c     |  38 +++++-
 src/gallium/drivers/cell/spu/spu_main.h        |   2 +-
 13 files changed, 367 insertions(+), 19 deletions(-)
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.c
 create mode 100644 src/gallium/drivers/cell/ppu/cell_fence.h

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 9ca4e9d67e..23fb0b0831 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -102,6 +102,8 @@
 #define CELL_CMD_STATE_RASTERIZER    22
 #define CELL_CMD_VS_EXECUTE          23
 #define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
+
 
 /** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
@@ -123,6 +125,29 @@
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
+
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
+
+struct cell_fence
+{
+   /** There's a 16-byte status qword per SPU */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
+
+
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+};
+
+
 /**
  * Command to specify per-fragment operations state and generated code.
  * Note that the dsa, blend, blend_color fields are really only needed
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31..9358a47284 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
 	cell_clear.c \
 	cell_context.c \
 	cell_draw_arrays.c \
+	cell_fence.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
 	cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 01254aed60..448b723d85 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
 
 #include "cell_context.h"
 #include "cell_batch.h"
+#include "cell_fence.h"
 #include "cell_spu.h"
 
 
@@ -63,6 +64,10 @@ cell_get_empty_buffer(struct cell_context *cell)
                printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
                prev_buffer = buf;
+
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
                return buf;
             }
          }
@@ -84,6 +89,26 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+}
+
+
 /**
  * Flush the current batch buffer to the SPUs.
  * An empty buffer will be found and set as the new current batch buffer
@@ -102,6 +127,12 @@ cell_batch_flush(struct cell_context *cell)
    if (size == 0)
       return;
 
+   /* Before we use this batch buffer, make sure any fenced texture buffers
+    * are released.
+    */
+   if (cell->fenced_buffers[batch].head)
+      emit_fence(cell);
+
    flushing = TRUE;
 
    assert(batch < CELL_NUM_BUFFERS);
@@ -142,6 +173,7 @@ uint
 cell_batch_free_space(const struct cell_context *cell)
 {
    uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   free -= sizeof(struct cell_command_fence);
    return free;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 7a2d93ecb4..22d552d8e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_surface.h"
@@ -104,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
+   uint i;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -151,6 +153,10 @@ cell_create_context(struct pipe_screen *screen,
                                               cell_debug_flags, 
                                               0 );
 
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
    /*
     * SPU stuff
     */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index ad1f4829a4..4491ae8cdf 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -81,6 +81,19 @@ struct cell_fragment_ops_key
 };
 
 
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+   struct cell_fence fence;
+   struct cell_buffer_node *head;
+};
+
+
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -154,6 +167,14 @@ struct cell_context
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceeding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..ffb3bea12b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < CELL_MAX_SPUS; i++) {
+      fence->status[i][0] = CELL_FENCE_IDLE;
+   }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < cell->num_spus; i++) {
+      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
+      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+         return FALSE;
+   }
+   return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   while (!cell_fence_signalled(cell, fence)) {
+      usleep(10);
+   }
+}
+
+
+
+
+struct cell_buffer_node
+{
+   struct pipe_buffer *buffer;
+   struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_buffer *buffer)
+{
+   struct pipe_screen *ps = cell->pipe.screen;
+   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+   /* create new list node which references the buffer, insert at head */
+   if (node) {
+      pipe_buffer_reference(ps, &node->buffer, buffer);
+      node->next = list->head;
+      list->head = node;
+   }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   if (list->head) {
+      struct pipe_screen *ps = cell->pipe.screen;
+      struct cell_buffer_node *node;
+
+      cell_fence_finish(cell, &list->fence);
+
+      /* traverse the list, unreferencing buffers, freeing nodes */
+      node = list->head;
+      while (node) {
+         struct cell_buffer_node *next = node->next;
+         assert(node->buffer);
+         pipe_buffer_unmap(ps, node->buffer);
+#if 0
+         printf("Unref buffer %p\n", node->buffer);
+         if (node->buffer->refcount == 1)
+            printf("   Delete!\n");
+#endif
+         pipe_buffer_reference(ps, &node->buffer, NULL);
+         FREE(node);
+         node = next;
+      }
+      list->head = NULL;
+   }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are current bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_texture *ct = cell->texture[i];
+      if (ct) {
+         uint level;
+         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+            if (ct->tiled_buffer[level]) {
+#if 0
+               printf("Adding texture %p buffer %p to list\n",
+                      ct, ct->tiled_buffer[level]);
+#endif
+               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index effcd2a1e1..dd2d7f7d1e 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -225,7 +225,7 @@ cell_emit_state(struct cell_context *cell)
             if (cell->texture[i]) {
                uint level;
                for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
-                  texture->start[level] = cell->texture[i]->tiled_data[level];
+                  texture->start[level] = cell->texture[i]->tiled_mapped[level];
                   texture->width[level] = cell->texture[i]->base.width[level];
                   texture->height[level] = cell->texture[i]->base.height[level];
                   texture->depth[level] = cell->texture[i]->base.depth[level];
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9c6741f1bc..9ac2f3bbb9 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -136,6 +136,9 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
+      /* Delete this texture now.
+       * But note that the underlying pipe_buffer may linger...
+       */
       struct cell_texture *ct = cell_texture(*pt);
       uint i;
 
@@ -146,14 +149,12 @@ cell_texture_release(struct pipe_screen *screen,
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
-         if (ct->tiled_data[i]) {
-            /* XXX need to use a fenced buffer for tiled data so that
-             * it's properly freed after rendering has completed.
-             * Disabling this free() allows glDrawPixels to work for now.
-             */
-#if 0
-            align_free(ct->tiled_data[i]);
-#endif
+         /* Unreference the tiled image buffer.
+          * It may not actually be deleted until a fence is hit.
+          */
+         if (ct->tiled_buffer[i]) {
+            ct->tiled_mapped[i] = NULL;
+            winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
          }
       }
 
@@ -234,12 +235,18 @@ cell_twiddle_texture(struct pipe_screen *screen,
          int offset = bufWidth * bufHeight * 4 * surface->face;
          uint *dst;
 
-         if (!ct->tiled_data[level]) {
-            ct->tiled_data[level] =
-               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         if (!ct->tiled_buffer[level]) {
+            /* allocate buffer for tiled data now */
+            struct pipe_winsys *ws = screen->winsys;
+            uint bytes = bufWidth * bufHeight * 4 * numFaces;
+            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+                                                        PIPE_BUFFER_USAGE_PIXEL,
+                                                        bytes);
+            /* and map it */
+            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+                                                     PIPE_BUFFER_USAGE_GPU_READ);
          }
-
-         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
 
          twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
                             surface->stride, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index a0757091b0..2f5fe0dd1b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -48,7 +48,10 @@ struct cell_texture
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
+   /** Texture data in tiled layout is held here */
+   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+   /** Mapped, tiled texture data */
+   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..65ba51b6bb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
 
 #include "cell_batch.h"
 #include "cell_context.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
    */
 
+   /* Make sure texture buffers aren't released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 9c853c0961..a6ed29ea63 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -76,9 +76,10 @@ static void
 release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
+   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE};
    const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
@@ -93,6 +94,29 @@ release_buffer(uint buffer)
 }
 
 
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED};
+   uint *dst = (uint *) fence_cmd->fence;
+   dst += 4 * spu.init.id;  /* main store/memory address, not local store */
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_FENCE,           /* tag */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
@@ -637,6 +661,14 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 8;
+         }
+         break;
       case CELL_CMD_RELEASE_VERTS:
          {
             struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 95ef4c9244..668af10be2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -210,7 +210,7 @@ extern struct spu_global spu;
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
 #define TAG_DCACHE3           23
-
+#define TAG_FENCE             24
 
 
 static INLINE void
-- 
cgit v1.2.3


From 711f8a1dd94e2e1e715615d947e03015ef972326 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 30 Oct 2008 15:24:23 -0600
Subject: CELL: stencil bug fixes

Two definitive bugs in stenciling were fixed.

The first, reversed registers in the generated Select Bytes (selb)
instruction, caused the stenciling INCR and DECR operations to
fail dramatically, putting new values in where old values were
supposed to be and vice versa.

The second caused stencil tiles to not be read and written from
main memory by the SPUs.  A per-spu flag, spu.read_depth, was used
to indicate whether the SPU should be reading depth tiles, and was set
only when depth was enabled.  A second flag, spu.read_stencil, was
set when stenciling was enabled, but never referenced.

As stenciling and depth are in the same tiles on the Cell, and there
is no corresponding TAG_WRITE_TILE_STENCIL to complement
TAG_WRITE_TILE_COLOR and TAG_WRITE_TILE_Z, I fixed this by
eliminating the unused "spu.read_stencil", renaming "spu.read_depth"
to "spu.read_depth_stencil", and setting it if either stenciling or
depth is enabled.

I also added an optimization to the fragment ops generation code,
that avoids calculating stencil values and/or stencil writemask
when the stencil operations are all KEEP.
---
 progs/trivial/tri-stencil.c                      | 13 ++++++++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 25 ++++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.c       |  3 +--
 src/gallium/drivers/cell/spu/spu_main.h          |  3 +--
 src/gallium/drivers/cell/spu/spu_render.c        |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c           |  2 +-
 6 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/progs/trivial/tri-stencil.c b/progs/trivial/tri-stencil.c
index 5edbef26ce..7686e16aef 100644
--- a/progs/trivial/tri-stencil.c
+++ b/progs/trivial/tri-stencil.c
@@ -49,7 +49,15 @@ static void Key(unsigned char key, int x, int y)
 
     switch (key) {
       case 27:
+        printf("Exiting...\n");
 	exit(1);
+      case 'r':
+        printf("Redisplaying...\n");
+        glutPostRedisplay();
+        break;
+      default:
+        printf("No such key '%c'...\n", key);
+        break;
     }
 }
 
@@ -89,7 +97,7 @@ static void Draw(void)
    glEnd();
 #endif
 
-#if 0
+#if 1
    glStencilFunc(GL_EQUAL, 1, 1);
    glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
 
@@ -130,7 +138,8 @@ int main(int argc, char **argv)
 	exit(1);
     }
 
-    glutInitWindowPosition(0, 0); glutInitWindowSize( 300, 300);
+    glutInitWindowPosition(0, 0); 
+    glutInitWindowSize( 300, 300);
 
     type = GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH | GLUT_STENCIL;
     glutInitDisplayMode(type);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..8e4dd82404 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1282,7 +1282,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
       spe_ai(f, newS_reg, fbS_reg, 1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1295,7 +1295,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate with a (-1) value works */
       spe_ai(f, newS_reg, fbS_reg, -1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1534,15 +1534,28 @@ gen_stencil_depth_test(struct spe_function *f,
     * meaning that we have to calculate the stencil values but do not
     * need to mask them), we can avoid generating code.  Don't forget
     * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
-      /* Trivial: don't need to calculate stencil values, and don't need to 
-       * write them back to the framebuffer.
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+       /* No changes to any stencil values */
+       need_to_calculate_stencil_values = false;
+       need_to_writemask_stencil_values = false;
+    }
+    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 63818d4c46..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       }
    }
 
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
    tile_t ztile ALIGN16_ATTRIB;
 
    /** Read depth/stencil tiles? */
-   boolean read_depth;
-   boolean read_stencil;
+   boolean read_depth_stencil;
 
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
-- 
cgit v1.2.3


From 90027f85786406133a5180998a75fb612b6a221e Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Tue, 11 Nov 2008 13:57:10 -0700
Subject: CELL: two-sided stencil fixes

With these changes, the tests/stencil_twoside test now works.

- Eliminate blending from the stencil_twoside test, as it produces an
  unneeded dependency on having blending working

- The spe_splat() function will now work if the register being splatted
  and the destination register are the same

- Separate fragment code generated for front-facing and back-facing
  fragments.  Often these are the same; if two-sided stenciling is on,
  they can be different.  This is easier and faster than generating
  code that does both tests and merges the results.

- Fixed a cut/paste bug where if the back Z-pass stencil operation
  were different from all the other operations, the back Z-fail
  results were incorrect.
---
 progs/tests/stencil_twoside.c                      |   2 -
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |   7 +-
 src/gallium/drivers/cell/common.h                  |   6 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 239 ++++++---------------
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  19 +-
 src/gallium/drivers/cell/spu/spu_command.c         |   6 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   6 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  10 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  20 +-
 12 files changed, 115 insertions(+), 208 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/progs/tests/stencil_twoside.c b/progs/tests/stencil_twoside.c
index be9d9a776a..8826c46fc2 100644
--- a/progs/tests/stencil_twoside.c
+++ b/progs/tests/stencil_twoside.c
@@ -115,7 +115,6 @@ static void Display( void )
    glVertex2f(-1,  1);
    glEnd();
 
-
    if (use20syntax) {
       stencil_func_separate(GL_FRONT, GL_ALWAYS, 0, ~0);
       stencil_func_separate(GL_BACK, GL_ALWAYS, 0, ~0);
@@ -279,7 +278,6 @@ static void Init( void )
    stencil_op_separate = glutGetProcAddress( "glStencilOpSeparate" );
 
    printf("\nAll 5 squares should be the same color.\n");
-   glEnable( GL_BLEND );
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f8568f690b..1bd9f1c8dd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -958,9 +958,12 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
+   /* Use a temporary, just in case rT == rA */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
-   spe_ila(p, rT, 0x00010203);
-   spe_shufb(p, rT, rA, rA, rT);
+   spe_ila(p, tmp_reg, 0x00010203);
+   spe_shufb(p, rT, rA, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
 
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 87488ea2d7..a670ed3c6e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,6 +130,9 @@
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
 
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
 struct cell_fence
 {
    /** There's a 16-byte status qword per SPU */
@@ -160,7 +163,8 @@ struct cell_command_fragment_ops
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index d9c3ff3f4d..6e425eafaa 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1412,144 +1412,72 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
  * and released by the corresponding spe_release_register_set() call.
  */
 static void
-gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+                       const unsigned int depth_enabled,
                        unsigned int fbS_reg, 
                        unsigned int *fail_reg, unsigned int *zfail_reg, 
-                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
-                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+                       unsigned int *zpass_reg)
 {
-   unsigned zfail_op, back_zfail_op;
+   unsigned zfail_op;
 
    /* Stenciling had better be enabled here */
-   ASSERT(dsa->stencil[0].enabled);
+   ASSERT(stencil->enabled);
 
    /* If the depth test is not enabled, it is treated as though it always
-    * passes.  In particular, that means that the "zfail_op" (and the backfacing
-    * counterpart, if active) are not considered - a failing stencil test will
-    * trigger the "fail_op", and a passing stencil test will trigger the
-    * "zpass_op".
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op
     *
-    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
-    * we keep them from being calculated.
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
     */
-   if (dsa->depth.enabled) {
-      zfail_op = dsa->stencil[0].zfail_op;
-      back_zfail_op = dsa->stencil[1].zfail_op;
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
    }
    else {
       zfail_op = PIPE_STENCIL_OP_KEEP;
-      back_zfail_op = PIPE_STENCIL_OP_KEEP;
    }
 
    /* One-sided or front-facing stencil */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
       *fail_reg = fbS_reg;
    }
    else {
       *fail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 
          0xff, fbS_reg, *fail_reg);
    }
 
+   /* Check the possibly overridden value, not the structure value */
    if (zfail_op == PIPE_STENCIL_OP_KEEP) {
       *zfail_reg = fbS_reg;
    }
-   else if (zfail_op == dsa->stencil[0].fail_op) {
+   else if (zfail_op == stencil->fail_op) {
       *zfail_reg = *fail_reg;
    }
    else {
       *zfail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 
          0xff, fbS_reg, *zfail_reg);
    }
 
-   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
       *zpass_reg = fbS_reg;
    }
-   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+   else if (stencil->zpass_op == stencil->fail_op) {
       *zpass_reg = *fail_reg;
    }
-   else if (dsa->stencil[0].zpass_op == zfail_op) {
+   else if (stencil->zpass_op == zfail_op) {
       *zpass_reg = *zfail_reg;
    }
    else {
       *zpass_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 
          0xff, fbS_reg, *zpass_reg);
    }
-
-   /* If two-sided stencil is enabled, we have more work to do. */
-   if (!dsa->stencil[1].enabled) {
-      /* This just flags that the registers need not be deallocated later */
-      *back_fail_reg = fbS_reg;
-      *back_zfail_reg = fbS_reg;
-      *back_zpass_reg = fbS_reg;
-   }
-   else {
-      /* Same calculations as above, but for the back stencil */
-      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_fail_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
-         *back_fail_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == zfail_op) {
-         *back_fail_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
-         *back_fail_reg = *zpass_reg;
-      }
-      else {
-         *back_fail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_fail_reg);
-      }
-
-      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zfail_reg = fbS_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].fail_op) {
-         *back_zfail_reg = *fail_reg;
-      }
-      else if (back_zfail_op == zfail_op) {
-         *back_zfail_reg = *zfail_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
-         *back_zfail_reg = *zpass_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[1].fail_op) {
-         *back_zfail_reg = *back_fail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zfail_reg);
-      }
-
-      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zpass_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
-         *back_zpass_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == zfail_op) {
-         *back_zpass_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
-         *back_zpass_reg = *zpass_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
-         *back_zpass_reg = *back_fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
-         *back_zpass_reg = *back_zfail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zpass_reg);
-      }
-   } /* End of calculations for back-facing stencil */
 }
 
 /* Note that fbZ_reg may *not* be set on entry, if in fact
@@ -1559,7 +1487,7 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
-                       const int const facing_reg,
+                       const uint facing,
                        const int mask_reg, const int fragZ_reg, 
                        const int fbZ_reg, const int fbS_reg)
 {
@@ -1571,6 +1499,8 @@ gen_stencil_depth_test(struct spe_function *f,
    boolean need_to_calculate_stencil_values;
    boolean need_to_writemask_stencil_values;
 
+   struct pipe_stencil_state *stencil;
+
    /* Registers.  We may or may not actually allocate these, depending
     * on whether the state values indicate that we need them.
     */
@@ -1598,6 +1528,20 @@ gen_stencil_depth_test(struct spe_function *f,
    spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+   }
+
    /* Calculate the writemask.  If the writemask is trivial (either
     * all 0s, meaning that we don't need to calculate any stencil values
     * because they're not going to change the stencil anyway, or all 1s,
@@ -1608,24 +1552,20 @@ gen_stencil_depth_test(struct spe_function *f,
     * Note that if the backface stencil is *not* enabled, the backface
     * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-       /* No changes to any stencil values */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
        need_to_calculate_stencil_values = false;
        need_to_writemask_stencil_values = false;
     }
-    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+    else if (stencil->write_mask == 0x0) {
       /* All changes are writemasked out, so no need to calculate
        * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
+   else if (stencil->write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1645,14 +1585,7 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
-      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
-         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
-         spe_comment(f, 0, "Resolving two-sided stencil writemask");
-         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
-         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
-         spe_release_register(f, back_write_mask_reg);
-      }
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
    }
 
    /* At least one-sided stenciling must be on.  Generate code that
@@ -1666,19 +1599,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
-   gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
-
-   /* If two-sided stenciling is on, generate code to run the stencil
-    * test on the backfacing stencil as well, and combine the two results
-    * into the one correct result based on facing.
-    */
-   if (dsa->stencil[1].enabled) {
-      unsigned int temp_reg = spe_allocate_available_register(f);
-      spe_comment(f, 0, "Running backface stencil test");
-      gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
-      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
-      spe_release_register(f, temp_reg);
-   }
+   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
 
    /* Generate code that, given the mask of valid fragments and the
     * mask of valid fragments that passed the stencil test, computes
@@ -1698,9 +1619,6 @@ gen_stencil_depth_test(struct spe_function *f,
 
    /* We may not need to calculate stencil values, if the writemask is off */
    if (need_to_calculate_stencil_values) {
-      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
-      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
-
       /* Generate code that calculates exactly which stencil values we need,
        * without calculating the same value twice (say, if two different
        * stencil ops have the same value).  This code will work for one-sided
@@ -1715,51 +1633,11 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
-      spe_comment(f, 0, "Computing stencil values");
-      gen_get_stencil_values(f, dsa, fbS_reg, 
-         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
-         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
-         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
-
-      /* Tricky, tricky, tricky - the things we do to create optimal
-       * code...
-       *
-       * The various stencil values registers may overlap with each other
-       * and with fbS_reg arbitrarily (as any particular operation is
-       * only calculated once and stored in one register, no matter
-       * how many times it is used).  So we can't change the values 
-       * within those registers directly - if we change a value in a
-       * register that's being referenced by two different calculations,
-       * we've just unwittingly changed the second value as well...
-       *
-       * Avoid this by allocating new registers to hold the results
-       * (there may be 2, if the depth test is off, or 3, if it is on).
-       * These will be released as part of the register set.
-       */
-      if (!dsa->stencil[1].enabled) {
-         /* The easy case: if two-sided stenciling is *not* enabled, we
-          * just use the front-sided values.
-          */
-         stencil_fail_values = front_stencil_fail_values;
-         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
-         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
-      }
-      else { /* two-sided stencil enabled */
-         spe_comment(f, 0, "Resolving backface stencil values");
-         /* Allocate new registers for the needed merged values */
-         stencil_fail_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
-         if (dsa->depth.enabled) {
-            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
-            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
-         }
-         else {
-            stencil_pass_depth_fail_values = fbS_reg;
-         }
-         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
-      }
-   }
+      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
 
    /* We now have all the stencil values we need.  We also need 
     * the results of the depth test to figure out which
@@ -1896,10 +1774,12 @@ gen_stencil_depth_test(struct spe_function *f,
  * should be much faster.
  *
  * \param cell  the rendering context (in)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
  * \param f     the generated function (out)
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
    const struct pipe_blend_state *blend = cell->blend;
@@ -1917,7 +1797,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
-   const int facing_reg = 13; /* uint */
+
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1945,7 +1826,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
-   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1969,6 +1849,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
+   /* Generate the alpha test, if needed. */
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
@@ -2095,7 +1976,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * gen_stencil_depth_test() function must ignore the
           * fbZ_reg register if depth is not enabled.
           */
-         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dc..2fabfdfb08 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index dd2d7f7d1e..031b27f11f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -75,23 +75,29 @@ lookup_fragment_ops(struct cell_context *cell)
     * If not found, create/save new fragment ops command.
     */
    if (!ops) {
-      struct spe_function spe_code;
+      struct spe_function spe_code_front, spe_code_back;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
       /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
 
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
+      /* generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
       /* alloc new fragment ops command */
       ops = CALLOC_STRUCT(cell_command_fragment_ops);
 
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
+      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
 
@@ -99,7 +105,8 @@ lookup_fragment_ops(struct cell_context *cell)
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
 
       /* release rtasm buffer */
-      spe_release_func(&spe_code);
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
    }
    else {
       if (0)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d726622d94..d5faf4e3aa 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -214,7 +214,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
@@ -234,7 +235,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * raw state records that the fallback code requires.
     */
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
+      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
    }
    else {
       /* otherwise, the default fallback code remains in place */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c8bb251905..7033f6037d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,7 +63,8 @@ one_time_init(void)
     * This will normally be overriden by a code-gen'd function
     * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
-   spu.fragment_ops = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
 
@@ -90,7 +91,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 692790c9f3..24cf7d77ce 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,8 +85,7 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask,
-                                      uint facing);
+                                      vector unsigned int mask);
 
 /** Function for running fragment program */
 typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
@@ -170,9 +169,10 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   /** Current fragment ops function */
-   spu_fragment_ops_func fragment_ops;
+   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
    uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f8ffc70492..683664e8a4 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -75,8 +75,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask,
-                          uint facing)
+                          vector unsigned int mask)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index a61689c83a..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,8 +38,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask,
-                          uint facing);
+                          vector unsigned int mask);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 5f908159bb..22e51a86ae 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -275,15 +275,20 @@ emit_quad( int x, int y, mask_t mask)
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
           */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                           fragZ,
                           outputs[0*4+0],
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask,
-                          setup.facing);
+                          mask);
       }
    }
 }
@@ -519,7 +524,14 @@ setup_sort_vertices(const struct vertex_header *v0,
 
    setup.oneOverArea = 1.0f / area;
 
-   /* The product of area * sign indicates front/back orientation (0/1) */
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
    setup.facing = (area * sign > 0.0f)
       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
-- 
cgit v1.2.3


From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 21 Nov 2008 11:42:14 -0700
Subject: CELL: use variant-length fragment ops programs

This is a set of changes that optimizes the memory use of fragment
operation programs (by using and transmitting only as much memory as is
needed for the fragment ops programs, instead of maximal sizes), as well
as eliminate the dependency on hard-coded maximal program sizes.  State
that is not dependent on fragment facing (i.e. that isn't using
two-sided stenciling) will only save and transmit a single
fragment operation program, instead of two identical programs.

- Added the ability to emit a LNOP (No Operation (Load)) instruction.
  This is used to pad the generated fragment operations programs to
  a multiple of 8 bytes, which is necessary for proper operation of
  the dual instruction pipeline, and also required for proper SPU-side
  decoding.

- Added the ability to allocate and manage a variant-length
  struct cell_command_fragment_ops.  This structure now puts the
  generated function field at the end, where it can be as large
  as necessary.

- On the PPU side, we now combine the generated front-facing and
  back-facing code into a single variant-length buffer (and only use one
  if the two sets of code are identical) for transmission to the SPU.

- On the SPU side, we pull the correct sizes out of the buffer,
  allocate a new code buffer if the one we have isn't large enough,
  and save the code to that buffer.  The buffer is deallocated when
  the SPU exits.

- Commented out the emit_fetch() static function, which was not being used.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |   7 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 ++-
 src/gallium/auxiliary/util/u_memory.h            |   2 +
 src/gallium/drivers/cell/common.h                |  31 +++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |   7 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  77 ++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   3 +
 src/gallium/drivers/cell/spu/spu_command.c       | 111 +++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.h       |  32 ++++++-
 src/gallium/drivers/cell/spu/spu_main.c          |  15 +--
 src/gallium/drivers/cell/spu/spu_main.h          |   4 +-
 11 files changed, 232 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell/spu/spu_main.h')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1bd9f1c8dd..b9a75ae559 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
@@ -713,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7c211ffc51..f9ad2acacd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
@@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
@@ -294,6 +296,10 @@ EMIT_RI16(spe_brz,       0x040)
 EMIT_RI16(spe_brhnz,     0x046)
 EMIT_RI16(spe_brhz,      0x044)
 
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
@@ -418,6 +424,7 @@ EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719d..1a6b596421 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
+
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a670ed3c6e..98554d7f52 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -121,11 +121,6 @@
 #define CELL_DEBUG_CMD                  (1 << 5)
 #define CELL_DEBUG_CACHE                (1 << 6)
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 128
-
-
-
 #define CELL_FENCE_IDLE      0
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
@@ -153,18 +148,36 @@ struct cell_command_fence
 
 /**
  * Command to specify per-fragment operations state and generated code.
- * Note that the dsa, blend, blend_color fields are really only needed
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
  * for the fallback/C per-pixel code.  They're not used when we generate
- * dynamic SPU fragment code (which is the normal case).
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
-   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 82336d6635..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f,
  * \param cell  the rendering context (in)
  * \param facing whether the generated code is for front-facing or 
  *              back-facing fragments
- * \param f     the generated function (out)
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
  */
 void
 cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
@@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
    if (cell->debug_flags & CELL_DEBUG_ASM) {
       spe_print_code(f, true);
       spe_indent(f, 8);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 031b27f11f..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    if (!ops) {
       struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
 
-      /* generate new code.  Always generate new code for both front-facing
+      /* Generate new code.  Always generate new code for both front-facing
        * and back-facing fragments, even if it's the same code in both
        * cases.
        */
       cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
       cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
-      /* alloc new fragment ops command */
-      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
 
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
-      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
 
       /* insert cell_command_fragment_ops object into keymap/cache */
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
@@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell)
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
       struct cell_command_fragment_ops *fops, *fops_cmd;
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      memcpy(fops_cmd, fops, sizeof(*fops));
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 18969005b0..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d5faf4e3aa..8500d19754 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   static int warned = 0;
-
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
    memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
     */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
-      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
       if (!warned) {
          fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
          warned = 1;
       }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
    }
 
-   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
-}
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Whoops. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
 
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
 
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
@@ -588,7 +615,8 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_ops *fops
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -756,3 +784,32 @@ command_loop(void)
    if (spu.init.debug_flags & CELL_DEBUG_CACHE)
       spu_dcache_report();
 }
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).
+    */
+   if (spu.fragment_ops_code != NULL) {
+      free(spu.fragment_ops_code);
+   }
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
index 853e9aa549..83dcdade28 100644
--- a/src/gallium/drivers/cell/spu/spu_command.h
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -1,7 +1,35 @@
-
-
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 extern void
 command_loop(void);
 
+extern void
+spu_command_init(void);
 
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7033f6037d..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,17 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function
-    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
-    */
-   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
-   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
    D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
@@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp)
 
    command_loop();
 
+   spu_command_close();
+
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 24cf7d77ce..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -169,8 +169,8 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
    /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
    spu_fragment_ops_func fragment_ops[2];
 
-- 
cgit v1.2.3